// CuNNy 4x12 - https://github.com/funnyplanter/CuNNy

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.


//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-04x12
//!USE MulAdd
//!CAPABILITY FP16

#include "../StubDefs.hlsli"

// Source image of the effect; Pass 1 lists it under `//!IN INPUT`.
//!TEXTURE
Texture2D INPUT;

// Effect output, declared at twice the input width and twice the input height.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point-filtered sampler; this is the sampler the O() macro passes to SampleLevel.
//!SAMPLER
//!FILTER POINT
SamplerState SP;

// Linear-filtered sampler.
// NOTE(review): Passes 1 and 2 do not reference SL; presumably a later pass
// uses it - confirm.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// O(t, x, y): sample texture `t` with the point sampler SP at mip level 0, at
// the coordinate `pos + float2(x, y) * pt`.
// `pos` and `pt` are NOT macro parameters: they are picked up by name from the
// scope where the macro is expanded, so every pass must declare locals with
// exactly those names before using O() (or any L0/L1/L2 wrapper built on it).
#define O(t, x, y) t.SampleLevel(SP, pos + float2(x, y) * pt, 0)
// Short aliases for the MF4 / MF4x4 types, which are not defined in this file.
// NOTE(review): presumably supplied by the effect framework (the header
// requests `//!CAPABILITY FP16`) - confirm where they come from.
#define V4 MF4
#define M4 MF4x4

// Intermediate textures T0..T5: all six are declared at input resolution with
// format R8G8B8A8_UNORM. According to the pass directives in this file they are
// used in two groups that alternate between reader and writer:
//   Pass 1 writes T0, T1, T2
//   Pass 2 reads  T0, T1, T2 and writes T3, T4, T5
//   Pass 3 reads  T3, T4, T5 and writes T0, T1, T2
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T0;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T2;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T3;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T4;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T5;

//!PASS 1
//!DESC in (3x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT T0, T1, T2

// Pass 1: for each input pixel, reads the rgb channels of INPUT over a 3x3
// neighbourhood and writes three 4-component results to T0, T1 and T2.
//
// L0(x, y) samples INPUT at offset (x, y) through O() and keeps only .rgb,
// converted to V3. V3 / M3x4 alias MF3 / MF3x4, which are not defined in this
// file (NOTE(review): presumably framework-provided types - confirm).
#define L0(x, y) V3(O(INPUT, x, y).rgb)
#define V3 MF3
#define M3x4 MF3x4

// blockStart: origin added to the per-thread coordinate.
// tid:        thread id; only tid.x is used, as the argument to Rmp8x8.
// NOTE(review): Rmp8x8 is defined elsewhere; with BLOCK_SIZE 8 / NUM_THREADS 64
// it presumably maps a 0..63 index to a coordinate inside an 8x8 tile - confirm.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O() macro reads them.
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Threads whose pixel lies outside GetInputSize() do nothing.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// Sampling coordinate for this pixel: (gxy + 0.5) scaled by pt.
	float2 pos = (gxy + 0.5) * pt;
	// Tap naming: s0_<row>_<col>, where row/col 0,1,2 correspond to y/x
	// offsets -1, 0, +1 (see the L0 calls below).
	V3 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2;
	// The 0.0 initialisers are overwritten by the three constant vectors right
	// below, which become the starting value of each accumulator.
	// NOTE(review): these constants are presumably the layer's bias terms - confirm.
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	r0 = V4(2.371e-02, 1.429e-03, -1.518e-03, -1.126e-03);
	r1 = V4(-8.419e-01, -3.915e-04, 7.733e-04, 1.825e-07);
	r2 = V4(-3.869e-02, 5.169e-03, -1.235e-04, -4.538e-04);
	// Fetch the nine rgb taps of the 3x3 neighbourhood.
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	// For every tap, fold it into r0, r1 and r2 through MulAdd, each with its own
	// M3x4 constant. MulAdd is not defined in this file (requested via
	// `//!USE MulAdd`); NOTE(review): presumably mul(tap, matrix) + accumulator -
	// confirm. The constants and the accumulation order are generated data; do
	// not reorder or reformat the literals by hand.
	r0 = MulAdd(s0_0_0, M3x4(-7.386e-03, -5.469e-02, 1.746e-02, -7.538e-02, -4.545e-02, -4.873e-01, -4.916e-02, -1.283e-01, -2.024e-03, -3.993e-02, 5.444e-03, 2.504e-01), r0);
	r1 = MulAdd(s0_0_0, M3x4(-7.243e-02, 5.732e-04, 1.089e-01, 2.781e-02, -6.259e-02, 1.818e-03, 5.067e-01, -1.359e-03, 3.032e-02, -5.970e-03, -7.168e-03, -4.071e-03), r1);
	r2 = MulAdd(s0_0_0, M3x4(-3.235e-03, -8.249e-03, 5.048e-03, 2.856e-03, 1.328e-02, 5.994e-02, 2.688e-02, -9.917e-03, -7.008e-03, 1.200e-02, -5.993e-05, 1.259e-02), r2);
	r0 = MulAdd(s0_0_1, M3x4(-4.481e-02, -4.577e-02, -2.425e-01, 8.612e-02, -1.968e-01, 2.938e-02, -9.121e-01, -7.690e-02, -1.899e-02, 3.110e-03, -1.540e-01, -1.780e+00), r0);
	r1 = MulAdd(s0_0_1, M3x4(-1.152e-01, 1.819e-02, 1.569e-02, 2.558e-01, -6.440e-02, -3.147e-02, 5.094e-02, 8.379e-01, -8.025e-02, 9.843e-03, 1.203e-02, 7.306e-02), r1);
	r2 = MulAdd(s0_0_1, M3x4(-1.968e-03, -2.847e-03, -4.148e-03, -2.091e-01, -6.473e-03, 9.986e-03, 4.125e-02, -8.573e-01, 6.614e-03, -3.453e-02, 2.267e-03, -6.174e-02), r2);
	r0 = MulAdd(s0_0_2, M3x4(-4.561e-03, 6.066e-02, 2.461e-01, -6.254e-02, -3.035e-02, 1.289e-01, 8.730e-01, -9.420e-03, -7.786e-04, 1.935e-02, 1.362e-01, 3.951e-01), r0);
	r1 = MulAdd(s0_0_2, M3x4(1.164e+00, -5.721e-04, -2.677e-02, -2.235e-02, 5.593e-01, 1.732e-02, 3.845e-02, 2.547e-02, 1.626e-01, -1.258e-02, -1.786e-02, -6.332e-03), r1);
	r2 = MulAdd(s0_0_2, M3x4(-5.411e-03, -1.579e-02, 6.164e-05, 3.549e-03, -1.140e-03, -7.766e-02, -1.922e-02, -2.055e-02, -3.550e-03, 1.331e-02, 1.014e-03, -1.736e-03), r2);
	r0 = MulAdd(s0_1_0, M3x4(-5.236e-02, -1.606e-01, -1.882e-02, 6.696e-03, -1.711e-01, -3.822e-01, 4.992e-02, 9.131e-02, -2.025e-02, -2.197e-02, -9.928e-03, 4.434e-02), r0);
	r1 = MulAdd(s0_1_0, M3x4(7.801e-04, 1.954e-03, -1.577e-01, -2.695e-01, -9.303e-03, -8.061e-03, -6.738e-01, -8.372e-01, 2.277e-03, 4.330e-03, 2.384e-02, -9.592e-02), r1);
	r2 = MulAdd(s0_1_0, M3x4(4.204e-02, 4.112e-02, -6.321e-03, -9.850e-03, 8.119e-02, -4.919e-02, 5.286e-03, 2.945e-03, 1.587e-02, -4.608e-03, 6.811e-03, -9.825e-03), r2);
	r0 = MulAdd(s0_1_1, M3x4(1.547e-01, 1.787e-01, 2.570e-01, -1.229e-02, 7.285e-01, 6.604e-01, 8.773e-01, 1.478e-01, 5.629e-02, 6.842e-02, 1.870e-01, 2.441e-01), r0);
	r1 = MulAdd(s0_1_1, M3x4(-5.694e-02, -1.830e-01, 2.021e-02, -1.896e-02, -5.155e-02, -8.613e-01, 5.923e-02, -2.199e-02, -4.334e-02, -5.851e-02, -2.120e-02, 4.841e-02), r1);
	r2 = MulAdd(s0_1_1, M3x4(5.260e-02, 3.307e-02, 1.349e-01, 2.220e-01, 3.151e-01, 3.018e-01, 5.776e-01, 8.574e-01, 2.537e-02, 6.137e-02, 4.454e-02, 4.895e-02), r2);
	r0 = MulAdd(s0_1_2, M3x4(-1.901e-02, -3.397e-04, -2.533e-01, 6.013e-02, -2.369e-02, 9.543e-02, -8.429e-01, 1.395e-02, 2.221e-03, -9.763e-03, -1.653e-01, -8.905e-02), r0);
	r1 = MulAdd(s0_1_2, M3x4(-2.009e-01, 1.782e-01, 1.372e-02, -3.673e-03, -2.411e-01, 8.752e-01, -2.155e-02, 1.346e-02, -2.332e-02, 4.865e-02, -5.773e-03, 1.563e-03), r1);
	r2 = MulAdd(s0_1_2, M3x4(2.545e-03, -2.133e-01, -2.288e-02, -7.434e-03, 7.792e-02, -8.770e-01, -7.027e-02, 2.110e-02, 3.652e-03, 5.824e-02, -2.055e-02, 8.632e-03), r2);
	r0 = MulAdd(s0_2_0, M3x4(-4.618e-03, 7.533e-02, 3.191e-04, 2.329e-02, -5.662e-03, 2.104e-01, -1.219e-02, -5.316e-02, 4.399e-03, 8.012e-03, 9.658e-03, -4.013e-03), r0);
	r1 = MulAdd(s0_2_0, M3x4(1.214e-02, -4.554e-03, 1.340e-02, 2.632e-02, 1.650e-02, -7.431e-05, -8.060e-03, -1.084e-03, -2.011e-02, -4.600e-04, -2.877e-02, -1.232e-02), r1);
	r2 = MulAdd(s0_2_0, M3x4(6.979e-02, -2.692e-02, 7.482e-03, 6.417e-03, 3.348e-01, 3.505e-02, 3.129e-02, 5.542e-03, -1.225e-02, -1.401e-02, -1.266e-03, 8.324e-04), r2);
	r0 = MulAdd(s0_2_1, M3x4(-1.724e-03, 5.198e-03, 1.529e-03, -2.257e-02, 3.968e-03, 5.456e-02, 6.396e-02, 1.018e-01, 1.426e-03, -8.712e-03, 7.497e-03, 2.531e-02), r0);
	r1 = MulAdd(s0_2_1, M3x4(-1.016e-03, -1.151e-03, -2.179e-02, 2.617e-04, 2.426e-03, -2.212e-02, 1.129e-02, -7.126e-03, -3.228e-02, 2.641e-02, -1.260e-03, -6.232e-03), r1);
	r2 = MulAdd(s0_2_1, M3x4(-2.182e-01, 3.730e-02, -2.093e-02, -1.364e-02, -1.346e+00, 1.938e-01, -4.649e-02, 5.817e-03, -3.633e-02, -2.405e-02, -1.693e-02, 5.099e-04), r2);
	r0 = MulAdd(s0_2_2, M3x4(1.103e-04, -6.213e-02, -1.111e-02, 3.962e-03, 7.550e-03, -3.073e-01, -4.484e-02, -4.740e-02, -9.361e-03, -1.714e-02, -1.528e-02, 3.008e-02), r0);
	r1 = MulAdd(s0_2_2, M3x4(-3.040e-02, -9.259e-03, -3.092e-03, 2.860e-03, -6.506e-03, 2.856e-02, -3.184e-03, -6.592e-03, 9.029e-03, -1.179e-02, 2.493e-02, 2.025e-03), r1);
	r2 = MulAdd(s0_2_2, M3x4(2.447e-02, 1.106e-01, 5.033e-03, 4.270e-03, 5.833e-02, 3.534e-01, 5.783e-03, -7.361e-03, -2.033e-02, -4.764e-02, 1.376e-02, 2.350e-03), r2);
	// Clamp each accumulator to be non-negative, then store it at this pixel.
	r0 = max(r0, 0.0);
	T0[gxy] = r0;
	r1 = max(r1, 0.0);
	T1[gxy] = r1;
	r2 = max(r2, 0.0);
	T2[gxy] = r2;
}

//!PASS 2
//!DESC conv1 (12x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T0, T1, T2
//!OUT T3, T4, T5

// Pass 2: for each pixel, reads all four channels of T0, T1 and T2 over a 3x3
// neighbourhood and writes three 4-component results to T3, T4 and T5.
//
// L0 / L1 / L2 sample T0 / T1 / T2 at offset (x, y) through O(), as V4.
// NOTE(review): L0 is also #defined by Pass 1 with a different body and is not
// #undef'd here; presumably each pass is compiled as its own unit so the two
// definitions never meet - confirm.
#define L0(x, y) V4(O(T0, x, y))
#define L1(x, y) V4(O(T1, x, y))
#define L2(x, y) V4(O(T2, x, y))

// blockStart: origin added to the per-thread coordinate.
// tid:        thread id; only tid.x is used, as the argument to Rmp8x8.
// NOTE(review): Rmp8x8 is defined elsewhere; with BLOCK_SIZE 8 / NUM_THREADS 64
// it presumably maps a 0..63 index to a coordinate inside an 8x8 tile - confirm.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O() macro reads them.
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Threads whose pixel lies outside GetInputSize() do nothing.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// Sampling coordinate for this pixel: (gxy + 0.5) scaled by pt.
	float2 pos = (gxy + 0.5) * pt;
	// Tap naming: s<set>_<row>_<col>, where row/col 0,1,2 correspond to y/x
	// offsets -1, 0, +1. Only two sets of nine taps are declared: s0_* first
	// holds the T0 taps and is later reloaded with the T2 taps; s1_* holds T1.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	// Unlike Pass 1, the accumulators are not reassigned before the first MulAdd,
	// so they start from 0.0.
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	// Fetch the 3x3 taps of T0 (s0_*) and T1 (s1_*).
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// T0 taps: fold each tap into r0, r1 and r2 through MulAdd, each with its own
	// M4 constant. MulAdd is not defined in this file (requested via
	// `//!USE MulAdd`); NOTE(review): presumably mul(tap, matrix) + accumulator -
	// confirm. The constants and the accumulation order are generated data; do
	// not reorder or reformat the literals by hand.
	r0 = MulAdd(s0_0_0, M4(1.163e-01, 1.156e-01, 1.457e-01, -6.471e-02, -1.618e-02, 9.700e-02, -1.586e-01, 1.539e-02, 1.924e-02, 1.621e-02, -1.343e-01, -4.331e-02, 5.127e-03, 8.125e-02, -1.768e-02, 1.399e-02), r0);
	r1 = MulAdd(s0_0_0, M4(4.747e-02, -1.428e-01, -3.318e-01, 4.421e-02, -9.593e-03, 9.177e-02, 5.804e-02, -2.659e-03, -8.757e-03, 2.132e-01, 1.320e-01, -5.020e-02, -1.620e-03, -7.543e-02, -1.298e-01, -1.489e-02), r1);
	r2 = MulAdd(s0_0_0, M4(2.494e-01, 3.677e-01, 8.672e-02, 1.938e-02, -6.616e-02, -2.685e-02, -1.523e-01, -4.285e-02, 5.086e-02, -6.759e-02, 8.454e-02, 2.415e-03, -2.521e-02, 8.552e-02, -1.935e-02, -3.447e-02), r2);
	r0 = MulAdd(s0_0_1, M4(2.002e-02, 2.456e-01, 4.244e-01, -1.283e-01, 6.184e-03, 2.605e-01, 2.199e-01, -2.072e-02, 2.171e-03, -1.605e-01, 4.502e-02, -2.313e-02, -4.377e-02, 1.323e-01, -8.970e-02, 2.607e-02), r0);
	r1 = MulAdd(s0_0_1, M4(7.455e-02, -3.308e-01, -6.099e-01, 9.643e-02, 2.136e-02, -1.195e-01, -3.679e-02, -8.183e-02, -1.535e-02, 8.867e-02, 1.545e-01, -2.720e-02, -1.320e-02, -1.139e-01, -4.386e-02, 2.701e-02), r1);
	r2 = MulAdd(s0_0_1, M4(-6.660e-01, -3.999e-01, -5.881e-02, -3.469e-01, -7.218e-02, -4.373e-02, -3.779e-01, -1.317e-01, 4.652e-02, 2.815e-03, 1.035e-03, 2.925e-02, -9.762e-02, -8.644e-03, -8.729e-02, -3.912e-02), r2);
	r0 = MulAdd(s0_0_2, M4(-1.642e-01, -1.925e-01, 1.791e-01, 4.927e-02, 2.007e-02, 1.683e-01, -2.002e-01, 6.172e-04, -3.581e-04, -6.771e-02, 1.810e-01, -4.235e-03, 1.902e-02, 7.933e-02, 1.135e-01, -1.783e-02), r0);
	r1 = MulAdd(s0_0_2, M4(-6.026e-02, -2.214e-01, 2.219e-01, -2.853e-02, 5.208e-02, -4.546e-02, 9.622e-03, -1.328e-02, -7.084e-03, 4.145e-03, 5.873e-02, -1.877e-02, 1.708e-02, 1.208e-01, -1.322e-02, 5.464e-02), r1);
	r2 = MulAdd(s0_0_2, M4(-1.215e-01, -8.505e-02, 2.639e-01, -3.397e-01, -6.812e-02, -1.726e-02, -2.596e-01, -1.450e-01, 1.194e-02, 3.039e-02, -8.624e-02, 5.425e-02, 3.149e-02, 1.806e-02, 1.849e-01, 8.528e-03), r2);
	r0 = MulAdd(s0_1_0, M4(-5.927e-02, -2.056e-01, -7.717e-02, 1.800e-01, -2.715e-02, -8.305e-02, 1.303e-01, -1.972e-02, 7.263e-02, 1.587e-01, -2.244e+00, -4.746e-02, -3.545e-02, -1.343e-01, 4.560e-02, 1.635e-02), r0);
	r1 = MulAdd(s0_1_0, M4(6.715e-02, -2.241e-01, -3.088e-01, -8.333e-02, -3.748e-02, 5.220e-02, 9.541e-02, -4.337e-02, 4.700e-02, 1.081e-01, -1.407e-01, 1.372e-03, -1.445e-02, -1.451e-01, 8.263e-02, -5.546e-02), r1);
	r2 = MulAdd(s0_1_0, M4(1.820e-01, -6.715e-02, 3.887e-01, 1.081e-01, -1.096e-01, 6.740e-02, -1.767e-01, -2.906e-02, 4.365e-01, 4.421e-02, 1.038e-01, -1.864e-02, 4.105e-02, 1.160e-01, -1.388e-01, 3.874e-02), r2);
	r0 = MulAdd(s0_1_1, M4(5.063e-01, 9.199e-01, 4.362e-01, 5.728e-01, -1.160e-02, -6.584e-02, -3.739e-01, 5.988e-02, 9.301e-02, 5.149e-02, -1.641e-02, -4.545e-02, -6.133e-02, -1.672e-01, -2.886e-01, 5.674e-02), r0);
	r1 = MulAdd(s0_1_1, M4(-3.628e-01, 6.313e-02, 7.062e-01, 3.819e-01, -3.538e-02, 2.900e-01, -9.975e-03, -2.518e-02, -2.022e-02, -2.613e-01, 9.266e-02, 5.188e-02, -3.095e-02, 1.751e-01, -2.423e-01, 2.622e-02), r1);
	r2 = MulAdd(s0_1_1, M4(2.380e-01, 1.193e-01, -7.730e-02, 5.914e-01, -3.413e-02, -1.030e-01, -6.568e-01, -3.359e-01, 2.100e-01, -1.267e-02, 4.550e-04, 4.862e-02, -1.263e-03, -9.311e-02, -3.138e-02, 7.157e-02), r2);
	r0 = MulAdd(s0_1_2, M4(-4.124e-02, -6.117e-02, -3.430e-01, 2.183e-01, 4.256e-02, -6.017e-02, 2.253e-01, 2.906e-02, 1.328e-03, -1.938e-01, 2.511e-01, -1.271e-02, 5.325e-03, -1.294e-01, -1.397e-01, 2.712e-02), r0);
	r1 = MulAdd(s0_1_2, M4(2.510e-01, 5.593e-01, 2.391e-01, 2.250e-01, 9.170e-02, -1.985e-01, 2.144e-02, 8.525e-03, -3.699e-02, 1.579e-01, 4.856e-02, -5.507e-02, 8.173e-02, 7.230e-02, -7.690e-02, 6.204e-02), r1);
	r2 = MulAdd(s0_1_2, M4(-1.216e-01, -7.150e-02, -4.137e-01, -3.132e-01, 7.101e-02, 4.673e-02, -1.607e-01, -5.207e-03, -5.130e-02, 2.507e-02, 9.695e-02, 8.657e-02, 5.831e-02, 3.104e-03, 7.544e-02, 5.899e-02), r2);
	r0 = MulAdd(s0_2_0, M4(-8.765e-02, -3.692e-01, -7.526e-02, -1.535e-02, 6.240e-03, 2.573e-02, 3.032e-02, -9.455e-03, 4.085e-02, 1.783e-01, 5.481e-02, -5.315e-02, -5.469e-02, 2.560e-02, 3.218e-02, 7.544e-02), r0);
	r1 = MulAdd(s0_2_0, M4(-7.377e-02, 2.922e-02, -7.358e-02, -1.411e-01, 7.064e-03, -2.110e-03, 6.317e-03, -1.400e-02, -5.536e-02, -4.443e-01, -4.341e-02, -3.408e-03, -5.640e-04, 7.144e-02, -1.986e-01, -2.526e-02), r1);
	r2 = MulAdd(s0_2_0, M4(-2.548e-01, 1.914e-02, 1.738e-01, -4.575e-01, -4.725e-02, 5.841e-03, -2.669e-01, -2.830e-02, 1.403e-01, -3.690e-02, 7.145e-02, -7.619e-02, -7.499e-02, -5.715e-02, -4.560e-02, -6.995e-02), r2);
	r0 = MulAdd(s0_2_1, M4(-1.990e-01, -3.749e-01, -2.455e-01, 1.485e-01, -5.685e-02, -2.182e-01, 2.371e-01, 3.915e-02, -4.616e-03, 2.107e-01, -2.549e-02, -7.563e-02, -8.913e-02, -2.548e-01, -1.722e-01, 2.370e-01), r0);
	r1 = MulAdd(s0_2_1, M4(-1.354e-02, 5.682e-01, 1.932e-01, -1.618e-01, 4.247e-02, 2.062e-01, 1.100e-01, 2.448e-02, -1.304e-01, -3.076e-01, -1.084e-01, 1.746e-02, 6.326e-02, 4.109e-01, 3.020e-01, 4.791e-01), r1);
	r2 = MulAdd(s0_2_1, M4(4.093e-01, 4.491e-02, 1.438e-01, 1.566e-01, -1.051e-01, 3.127e-02, -1.009e-01, -1.145e-01, 5.644e-02, -5.117e-03, 6.510e-02, 1.145e-01, -2.455e-01, 1.439e-02, -2.855e-01, 1.577e-01), r2);
	r0 = MulAdd(s0_2_2, M4(2.122e-02, -9.808e-02, -2.008e-01, -1.844e-01, -9.780e-03, 6.983e-02, -4.225e-02, 2.338e-02, -3.277e-03, 8.374e-03, 1.274e-01, -1.788e-02, 4.089e-02, 1.455e-01, 2.359e-03, 2.758e-02), r0);
	r1 = MulAdd(s0_2_2, M4(1.599e-01, -1.459e-01, -3.136e-01, -2.532e-01, 2.324e-02, 4.829e-02, 3.945e-02, 2.026e-02, -1.981e-02, 5.712e-02, 9.496e-02, -5.237e-02, 1.394e-01, 3.940e-02, 7.466e-02, 5.042e-02), r1);
	r2 = MulAdd(s0_2_2, M4(-7.073e-02, 5.994e-02, -4.630e-01, 9.550e-02, 4.291e-04, 1.232e-02, -2.183e-01, -7.130e-02, -1.984e-02, -2.565e-02, 1.065e-01, 1.063e-02, -1.149e-01, 9.814e-03, -1.312e-01, -4.967e-02), r2);
	// T1 taps (s1_*), accumulated the same way.
	r0 = MulAdd(s1_0_0, M4(1.266e-03, -7.660e-02, 6.654e-02, 1.252e-02, 4.065e-02, -2.436e-01, 6.218e-02, 1.169e-01, 2.334e-02, -3.310e-03, 6.085e-03, -3.955e-03, -1.004e-02, -5.177e-03, 3.345e-02, 6.903e-03), r0);
	r1 = MulAdd(s1_0_0, M4(2.281e-03, 3.444e-02, -5.487e-02, -3.204e-03, 3.302e-02, 2.775e-01, 2.703e-01, -2.022e-02, -1.340e-02, 1.124e-03, 9.245e-02, -6.460e-02, -4.856e-03, 5.154e-02, 3.824e-02, 2.094e-02), r1);
	r2 = MulAdd(s1_0_0, M4(1.947e-01, 9.310e-03, 6.216e-02, 2.033e-01, -7.847e-02, -1.272e+00, 4.814e-01, 1.184e-01, 2.148e-02, 2.296e-05, 1.556e-02, 1.717e-02, 8.473e-02, 5.144e-03, 5.642e-02, -2.126e-02), r2);
	r0 = MulAdd(s1_0_1, M4(-2.737e-03, -9.549e-02, -1.046e-01, 1.101e-02, -1.504e-01, -6.309e-01, -5.429e-02, 4.482e-02, 5.919e-02, 9.025e-02, 1.966e-01, -7.788e-02, -5.524e-02, -1.962e-02, -1.031e-01, 2.235e-02), r0);
	r1 = MulAdd(s1_0_1, M4(2.305e-02, 2.494e-02, -2.218e-02, 2.404e-02, -3.583e-02, 8.299e-01, -4.025e-02, -5.656e-03, 5.931e-02, 7.411e-03, -1.416e-01, -5.976e-02, -9.988e-03, 4.047e-03, 3.924e-02, -1.321e-02), r1);
	r2 = MulAdd(s1_0_1, M4(-7.245e-02, 2.901e-02, -9.027e-02, -7.411e-02, 1.909e-01, 1.330e-01, 1.605e-02, 3.563e-01, 3.619e-02, -4.567e-02, -1.969e-01, 9.875e-02, 3.314e-02, 1.746e-03, 1.292e-01, -1.363e-02), r2);
	r0 = MulAdd(s1_0_2, M4(-1.684e-02, 5.930e-04, 2.034e-03, 1.071e-02, -1.898e-02, 2.121e-02, -8.269e-03, 6.358e-03, 3.724e-02, 6.288e-02, -2.293e-01, 3.236e-02, -2.009e-02, 6.860e-02, 9.543e-02, 3.856e-03), r0);
	r1 = MulAdd(s1_0_2, M4(-1.282e-02, 2.649e-02, 3.006e-02, -2.435e-02, 2.254e-02, -7.417e-02, -1.256e-02, 2.873e-02, -4.595e-03, -1.692e-01, 4.261e-02, 6.391e-02, -8.283e-03, 1.478e-02, -8.458e-02, -5.418e-03), r1);
	r2 = MulAdd(s1_0_2, M4(-2.035e-03, 5.029e-03, 6.235e-02, 5.650e-03, -4.556e-02, -1.245e-02, 2.438e-02, -1.255e-02, -1.909e-01, 5.505e-02, -3.271e-01, -2.122e-01, 6.564e-02, -7.782e-03, 1.932e-01, 3.454e-02), r2);
	r0 = MulAdd(s1_1_0, M4(-7.247e-02, -1.497e-01, -6.152e-01, -6.253e-02, 3.747e-01, 3.875e-01, -8.220e-01, -3.239e-01, 1.289e-02, -6.893e-02, 7.039e-02, -3.210e-02, 7.061e-03, 9.071e-02, -6.554e-02, -1.396e-02), r0);
	r1 = MulAdd(s1_1_0, M4(-3.249e-02, -2.546e-01, -6.462e-02, -3.743e-02, -3.435e-02, -1.377e+00, -4.541e-01, 5.770e-02, -1.383e-02, 2.979e-02, 1.470e-01, -7.601e-02, 2.573e-02, -6.866e-02, -4.758e-02, 4.720e-02), r1);
	r2 = MulAdd(s1_1_0, M4(-1.649e-01, -1.881e-02, -1.043e-01, 1.331e-02, 5.438e-02, -6.616e-02, -1.458e-01, -5.527e-01, 3.316e-02, 3.800e-02, -6.719e-02, 9.455e-03, 8.345e-03, -7.177e-02, 7.818e-02, 4.042e-02), r2);
	r0 = MulAdd(s1_1_1, M4(-7.803e-02, -3.605e-02, -2.398e-01, 1.917e-02, 2.511e-01, -2.267e-01, 1.876e-01, -1.729e-01, 1.538e-02, -2.279e-01, 6.390e-01, -6.171e-02, 1.046e-02, 9.179e-03, -9.785e-01, -3.663e-02), r0);
	r1 = MulAdd(s1_1_1, M4(-1.102e-02, -1.066e-02, 1.134e-02, 1.200e-02, 5.475e-01, -7.988e-01, -3.063e+00, -2.201e-01, -1.164e-01, -2.261e-01, -3.449e-01, -3.471e-01, 1.974e-02, 4.171e-01, -5.344e-02, -6.616e-02), r1);
	r2 = MulAdd(s1_1_1, M4(-4.493e-02, -2.040e-02, 7.080e-02, -1.526e-01, 3.963e-02, -4.453e-02, 4.740e-01, -5.100e-01, 8.585e-02, -3.345e-02, 3.993e-01, 1.578e-02, -2.661e-05, 1.410e-01, 1.996e-01, 7.827e-02), r2);
	r0 = MulAdd(s1_1_2, M4(-3.559e-03, 3.688e-02, 1.636e-02, -1.349e-03, -1.749e-02, -1.430e-01, 9.743e-03, 9.175e-03, -4.444e-02, 9.095e-01, -3.248e-01, 2.190e-01, 1.318e-01, -3.054e+00, 1.550e-01, -9.009e-02), r0);
	r1 = MulAdd(s1_1_2, M4(2.479e-02, 1.401e-02, -9.932e-03, -2.885e-04, 1.736e-03, 1.283e-01, 3.805e-02, 1.402e-01, -1.477e-01, -3.233e-02, -5.950e+00, 6.556e-01, 1.139e-01, -1.568e-01, -2.419e+00, -1.098e-02), r1);
	r2 = MulAdd(s1_1_2, M4(4.808e-03, 2.744e-03, -3.498e-02, -1.648e-02, -1.148e-02, -4.233e-03, 4.747e-02, 3.494e-02, -2.221e-02, -6.565e-02, 1.078e-02, -2.654e-01, -1.278e-02, 2.576e-02, 2.376e-01, 1.695e-01), r2);
	r0 = MulAdd(s1_2_0, M4(-1.146e-01, -6.326e-02, 2.282e-01, 1.471e-01, 3.068e-03, 1.821e-01, 1.503e-01, 5.306e-02, 2.274e-02, 2.170e-02, 2.325e-01, -2.282e-02, 4.257e-02, 1.086e-01, -3.423e-02, -7.035e-03), r0);
	r1 = MulAdd(s1_2_0, M4(3.910e-02, 1.917e-01, 6.100e-01, 1.230e-01, -2.567e-02, 3.714e-01, 6.577e-02, 1.445e-02, 3.223e-02, -8.390e-02, -3.208e-02, -4.103e-02, -6.768e-03, 4.992e-02, 4.130e-02, 5.421e-02), r1);
	r2 = MulAdd(s1_2_0, M4(-1.788e-01, 8.515e-02, 5.660e-02, 4.852e-01, -2.444e-01, 4.538e-02, -3.718e-01, 1.655e-01, 8.216e-02, 5.839e-03, -1.204e-01, 1.093e-01, -1.041e-02, 2.283e-02, 1.597e-01, 3.630e-02), r2);
	r0 = MulAdd(s1_2_1, M4(1.185e-04, -1.779e-01, -2.445e-01, -2.418e-02, 1.743e-02, -1.275e-01, 9.508e-02, 2.757e-02, 1.389e-02, -1.383e-01, 1.173e-01, -2.857e-03, 8.423e-02, 1.210e-01, 5.521e-02, -8.439e-02), r0);
	r1 = MulAdd(s1_2_1, M4(5.963e-02, 3.690e-02, -2.565e-01, -2.116e-02, -1.534e-02, 2.914e-01, 1.096e-01, 3.019e-01, -8.793e-03, -2.056e-01, 7.105e-02, -6.085e-02, 5.072e-03, -5.783e-01, -6.557e-01, 4.832e-02), r1);
	r2 = MulAdd(s1_2_1, M4(4.574e-02, 1.974e-02, -2.873e-02, -7.527e-02, -1.452e-01, 3.639e-03, 3.845e-02, -3.287e-01, -2.881e-01, 1.305e-02, 3.056e-01, -2.282e-01, -2.982e-02, -8.287e-02, -2.300e-02, -3.526e-01), r2);
	r0 = MulAdd(s1_2_2, M4(-1.999e-03, 8.946e-02, 2.312e-02, 1.523e-02, 6.478e-03, 2.315e-02, 1.187e-01, 3.161e-03, 1.209e-01, 4.878e-02, 2.207e-02, 3.119e-02, 4.699e-02, -4.015e-02, 1.090e-01, -2.879e-03), r0);
	r1 = MulAdd(s1_2_2, M4(-5.550e-02, 5.757e-02, -4.030e-02, 2.643e-02, -1.467e-02, -1.424e-02, 7.836e-02, -3.287e-03, 4.053e-02, -2.723e-01, 5.134e-01, -3.544e-02, -3.992e-02, -1.719e-01, 1.076e-01, -1.023e-01), r1);
	r2 = MulAdd(s1_2_2, M4(-1.935e-02, 1.187e-02, -3.824e-02, -1.930e-02, 1.031e-02, 8.747e-03, -4.062e-02, -3.402e-02, 1.437e-01, 7.662e-02, -2.975e-01, 3.014e-01, -1.324e-01, -1.409e-02, -6.537e-02, 9.833e-03), r2);
	// The T0 taps are no longer needed: reload s0_* with the 3x3 taps of T2.
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	// T2 taps (now held in s0_*), accumulated the same way.
	r0 = MulAdd(s0_0_0, M4(1.049e-02, -1.335e-01, -1.297e-01, -4.511e-02, 7.642e-02, 1.458e-01, 4.228e-01, 3.283e-02, 4.311e-02, 1.344e-01, -6.650e-01, 1.524e-01, 1.049e-02, -2.780e-01, 1.578e-01, 3.780e-02), r0);
	r1 = MulAdd(s0_0_0, M4(-4.945e-02, 6.953e-02, 4.480e-01, 1.076e-01, -4.088e-02, -1.364e-01, -6.981e-02, -1.321e-02, 9.510e-02, 6.362e-01, 9.995e-02, 1.434e-01, 6.378e-03, 5.274e-02, 1.538e-01, 5.294e-02), r1);
	r2 = MulAdd(s0_0_0, M4(1.658e-01, -8.462e-02, -9.172e-02, 3.362e-02, -3.624e-02, 1.140e-01, -3.336e-02, 2.142e-01, -1.770e-02, 9.359e-01, -4.619e-01, -4.094e-01, 3.761e-02, -5.367e-02, 6.568e-02, 2.547e-02), r2);
	r0 = MulAdd(s0_0_1, M4(-3.524e-01, -7.459e-01, 4.531e-01, -1.294e-01, 6.494e-02, -7.296e-02, 1.826e-01, -8.026e-02, 1.385e-01, -9.923e-01, -3.027e-02, -7.091e-04, 4.135e-02, -1.150e-01, -3.314e-01, -3.982e-03), r0);
	r1 = MulAdd(s0_0_1, M4(9.941e-02, 7.051e-02, 7.081e-01, 3.595e-01, 8.379e-03, -4.014e-01, -8.898e-03, 1.091e-01, 1.924e-01, 1.126e-01, -4.621e-01, 1.577e-02, -1.730e-02, 1.571e-01, 1.148e-01, -7.555e-02), r1);
	r2 = MulAdd(s0_0_1, M4(2.943e-01, -6.791e-02, -3.460e-01, 1.828e-01, 1.267e-01, -7.782e-02, 2.735e-01, 5.548e-02, 1.449e-01, -6.915e-01, -6.504e-01, 1.029e-01, 1.675e-01, 2.399e-02, 6.291e-02, 2.583e-01), r2);
	r0 = MulAdd(s0_0_2, M4(1.067e-02, 8.607e-02, 1.720e-01, -3.334e-02, 3.202e-02, 2.845e-02, 1.829e-01, -2.759e-02, 1.440e-01, -6.175e-01, 2.417e-01, 7.253e-02, 4.019e-02, -4.140e-02, -4.056e-02, 1.281e-02), r0);
	r1 = MulAdd(s0_0_2, M4(-8.814e-02, -5.732e-01, -5.093e-01, 3.987e-02, -1.891e-03, 5.225e-02, -1.382e-02, -7.501e-02, 6.950e-02, 3.299e-01, 2.946e-01, 7.459e-02, -5.370e-03, 1.332e-01, 1.013e-01, -5.744e-02), r1);
	r2 = MulAdd(s0_0_2, M4(1.155e-01, 4.309e-02, -9.632e-02, 1.689e-01, 5.555e-03, 6.416e-02, 5.285e-02, 1.392e-01, -9.668e-02, -1.499e-01, -1.492e-01, 5.672e-02, 6.880e-02, 4.594e-02, -5.698e-02, 1.188e-01), r2);
	r0 = MulAdd(s0_1_0, M4(-4.763e-02, 1.066e-01, -5.677e-02, 3.206e-02, -3.233e-03, -1.175e-01, 1.398e-01, 1.013e-01, 6.491e-02, 2.819e-01, -4.151e-01, -1.479e-01, 6.811e-02, -1.406e-02, 8.030e-01, 2.339e-02), r0);
	r1 = MulAdd(s0_1_0, M4(5.653e-02, -1.158e-01, 3.994e-02, -2.781e-02, -4.152e-02, 6.296e-01, -1.744e+00, 3.799e-01, -2.964e-01, -7.482e-02, -5.059e-01, -7.172e-02, 5.907e-02, 4.077e-01, 3.215e-01, 2.188e-01), r1);
	r2 = MulAdd(s0_1_0, M4(1.577e-01, -5.115e-02, 8.960e-02, 1.598e-01, -3.831e-01, 3.237e-02, -1.098e-01, 1.558e-01, -8.850e-02, 3.037e-01, 5.178e-01, -3.877e-01, -8.910e-02, -1.753e-01, -3.563e-01, -2.639e-01), r2);
	r0 = MulAdd(s0_1_1, M4(3.252e-02, 2.205e-01, 1.770e-01, -4.160e-02, -9.119e-03, 1.145e-01, -2.775e-01, -7.459e-02, 1.816e-01, 1.913e-01, 4.988e-01, 5.357e-01, 3.685e-01, -4.095e-01, -2.006e+00, -3.086e-01), r0);
	r1 = MulAdd(s0_1_1, M4(2.626e-02, -4.013e-01, -4.012e-01, 4.832e-02, 4.896e-02, 3.947e-01, 7.472e-01, -6.949e-02, 2.622e-02, -1.703e-01, 4.125e-01, -6.872e-02, 2.196e-01, -1.812e+00, -9.043e-01, -2.901e-01), r1);
	r2 = MulAdd(s0_1_1, M4(4.824e-02, -9.106e-02, 8.551e-02, 4.870e-02, 4.366e-02, -2.477e-02, -3.135e-01, -4.450e-02, 7.160e-02, -1.774e-01, 1.321e-01, 4.167e-01, 4.435e-01, 7.851e-02, 2.368e-01, 1.116e-01), r2);
	r0 = MulAdd(s0_1_2, M4(-1.846e-02, 1.656e-01, 1.595e-01, 7.564e-02, 3.918e-02, 6.479e-03, 4.409e-02, -2.073e-02, -1.248e-01, 3.271e-01, -3.435e-02, -9.585e-02, -3.856e-02, -3.480e-01, -5.498e-03, 4.041e-02), r0);
	r1 = MulAdd(s0_1_2, M4(-1.070e-01, -2.319e-01, 2.577e-01, 9.888e-02, 1.467e-02, -4.102e-02, -5.286e-02, -3.510e-04, -6.217e-03, -4.660e-02, 3.231e-01, -1.836e-02, -7.904e-02, 5.138e-01, 3.097e-01, -9.498e-02), r1);
	r2 = MulAdd(s0_1_2, M4(2.055e-02, -1.885e-02, 1.421e-01, 7.128e-02, -5.062e-02, -5.073e-04, 8.391e-02, -3.746e-04, 6.806e-02, -7.608e-03, 1.445e-01, 8.995e-02, -1.348e-01, 6.406e-02, -3.115e-01, 3.193e-01), r2);
	r0 = MulAdd(s0_2_0, M4(-8.266e-03, -1.002e-02, 9.359e-02, -8.208e-03, 5.158e-02, 7.479e-02, 1.058e-01, 3.816e-03, 7.466e-04, 2.459e-01, 1.734e-01, -4.535e-02, 3.925e-02, 4.310e-02, 9.980e-02, 2.069e-02), r0);
	r1 = MulAdd(s0_2_0, M4(-1.841e-03, -7.327e-03, 6.179e-02, 2.707e-02, 1.806e-02, 6.385e-03, 1.243e-02, 1.325e-01, 2.972e-02, -2.149e-01, -4.066e-01, -7.819e-02, 7.887e-02, 5.567e-01, -8.194e-02, 1.024e-01), r1);
	r2 = MulAdd(s0_2_0, M4(2.281e-03, 6.580e-03, 3.127e-02, -2.496e-02, 1.714e-01, -6.991e-03, 1.204e-01, 1.104e-01, -2.670e-01, -4.913e-03, -4.114e-03, -7.852e-02, 9.898e-02, 4.270e-02, -1.438e-02, 4.004e-01), r2);
	r0 = MulAdd(s0_2_1, M4(3.980e-03, -1.561e-02, -5.038e-02, -4.837e-03, 1.541e-02, 1.008e-01, 4.123e-02, -1.364e-02, -3.167e-01, -9.263e-02, 2.300e-01, -6.978e-03, 3.604e-01, 2.132e-01, -1.353e-01, -1.899e-01), r0);
	r1 = MulAdd(s0_2_1, M4(-1.031e-02, 2.997e-02, 6.316e-03, 1.106e-02, -1.891e-02, -2.242e-02, 1.562e-02, 8.090e-03, 4.145e-02, -2.339e-01, 3.604e-01, -1.351e-01, 2.939e-01, -9.567e-01, -4.006e+00, 6.434e-02), r1);
	r2 = MulAdd(s0_2_1, M4(5.955e-03, 5.431e-04, 2.424e-02, 3.296e-02, -6.513e-02, 9.009e-03, -1.053e-01, 1.362e-02, -5.901e-02, -9.398e-02, 3.151e-01, 2.313e-01, 3.028e-01, -5.867e-02, 1.316e-01, -1.064e+00), r2);
	r0 = MulAdd(s0_2_2, M4(1.664e-03, 4.885e-02, 6.467e-03, 2.416e-03, 2.205e-02, -3.710e-02, -2.781e-02, 1.144e-02, -8.296e-02, 5.449e-01, -5.948e-02, 6.001e-03, 1.095e-01, -5.896e-02, 2.739e-01, 4.982e-02), r0);
	r1 = MulAdd(s0_2_2, M4(2.118e-02, -2.073e-02, -2.716e-02, -4.218e-02, -2.640e-03, -1.735e-02, -1.608e-02, 1.907e-02, -1.903e-01, -3.902e-01, 4.919e-02, 7.691e-02, 1.723e-01, -1.613e-01, -1.460e-01, 3.893e-02), r1);
	r2 = MulAdd(s0_2_2, M4(-8.745e-03, 1.296e-02, -4.421e-02, -1.450e-03, -1.843e-02, -5.913e-06, 1.192e-02, -5.708e-02, 3.204e-01, -1.079e-01, 1.221e-01, 2.175e-01, -9.344e-02, 3.161e-02, 2.237e-01, -1.026e-01), r2);
	// Clamp each accumulator to be non-negative, then store it at this pixel.
	r0 = max(r0, 0.0);
	T3[gxy] = r0;
	r1 = max(r1, 0.0);
	T4[gxy] = r1;
	r2 = max(r2, 0.0);
	T5[gxy] = r2;
}

//!PASS 3
//!DESC conv2 (12x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T3, T4, T5
//!OUT T0, T1, T2

#define L0(x, y) V4(O(T3, x, y))
#define L1(x, y) V4(O(T4, x, y))
#define L2(x, y) V4(O(T5, x, y))

// Pass 3 ("conv2 (12x12)" per the DESC directive of this pass).
//
// For the pixel gxy this reads a 3x3 neighbourhood from each of T3, T4 and T5,
// feeds every tap through MulAdd with a constant 4x4 weight matrix into three
// V4 accumulators (r0, r1, r2), clamps negative results to zero and writes the
// accumulators to T0, T1 and T2 respectively.
//
// blockStart: added to the per-thread offset returned by Rmp8x8 to form gxy.
// tid:        only tid.x is used, as the argument to Rmp8x8.
//
// NOTE(review): MulAdd, Rmp8x8, GetInputPt and GetInputSize are defined outside
// this chunk; presumably MulAdd(v, m, acc) evaluates mul(v, m) + acc - confirm.
void Pass3(uint2 blockStart, uint3 tid) {
	// pt scales the (x, y) tap offsets inside the O() sampling macro.
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Threads whose pixel falls outside the size reported by GetInputSize() do nothing.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// Sampling position: pixel coordinate offset by half a pixel, scaled by pt.
	float2 pos = (gxy + 0.5) * pt;
	// Tap naming: sK_R_C, where R = y offset + 1 and C = x offset + 1.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	// 3x3 taps from T3 (s0_*) and T4 (s1_*).
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// Accumulate the T3 taps: each tap contributes to all three accumulators.
	r0 = MulAdd(s0_0_0, M4(3.424e-01, -1.582e-01, -6.546e-02, -6.168e-04, -1.939e-01, 2.581e-01, 2.072e-02, 7.369e-02, 2.066e-01, -1.782e-01, 8.261e-02, 8.520e-03, -4.701e-02, -5.072e-02, 1.868e-01, 7.241e-02), r0);
	r1 = MulAdd(s0_0_0, M4(-6.540e-02, -1.407e-01, -5.914e-02, 1.370e-01, -7.487e-03, 1.734e-01, 3.866e-02, -7.816e-02, 1.286e-02, 2.867e-01, -2.130e-03, -3.328e-01, -2.020e-03, 8.844e-02, -7.585e-02, 1.378e-02), r1);
	r2 = MulAdd(s0_0_0, M4(7.944e-02, 8.578e-03, 1.318e-02, -6.486e-02, -5.134e-02, -1.568e-01, 3.603e-02, -1.226e-03, 1.029e-01, -3.026e-02, 1.478e-02, -1.552e-02, 3.369e-01, 1.223e-01, 1.333e-02, -3.220e-02), r2);
	r0 = MulAdd(s0_0_1, M4(-5.594e-02, 1.958e-01, 4.475e-01, 2.528e-01, -1.088e-01, 9.121e-02, -2.478e-01, -3.702e-03, -6.302e-02, -3.915e-01, -1.942e-01, 5.244e-02, 9.604e-01, -4.170e-01, 6.322e-01, -3.612e-02), r0);
	r1 = MulAdd(s0_0_1, M4(-6.959e-02, -4.734e-01, 7.459e-02, -2.052e-02, 3.991e-02, 2.206e-01, -6.098e-02, 2.447e-01, 4.352e-02, 2.168e-01, -3.148e-02, 1.754e-02, 1.714e-01, 6.895e-01, -1.596e-02, 5.220e-01), r1);
	r2 = MulAdd(s0_0_1, M4(-2.646e-01, -4.061e-02, -1.407e-02, 3.267e-02, 1.610e-02, 9.203e-02, -1.259e-01, 1.086e-02, -4.389e-02, 1.241e-01, -4.555e-02, 4.090e-02, -2.485e-01, 2.881e-02, -2.093e-01, 1.290e-01), r2);
	r0 = MulAdd(s0_0_2, M4(-1.221e-01, 1.343e-01, 1.756e-02, 4.272e-02, -1.000e-02, -6.718e-02, -2.495e-01, -1.367e-02, 1.086e-01, -3.007e-01, -9.456e-03, 2.766e-02, 1.830e-01, -8.254e-02, -6.654e-01, -2.775e-01), r0);
	r1 = MulAdd(s0_0_2, M4(2.765e-02, 5.702e-02, -1.251e-02, -1.180e-03, 8.983e-03, -3.260e-02, -1.310e-02, -8.851e-02, 5.279e-03, 5.018e-02, 7.017e-03, 1.063e-02, 1.985e-02, -2.096e-04, -1.460e-01, -1.961e-01), r1);
	r2 = MulAdd(s0_0_2, M4(4.184e-02, -3.108e-02, -7.327e-02, 2.286e-02, -1.713e-01, -1.317e-01, -2.048e-02, -3.717e-02, -2.856e-02, -3.651e-02, 4.275e-02, -7.003e-03, -5.321e-02, -1.217e-01, -1.480e-01, -1.183e-01), r2);
	r0 = MulAdd(s0_1_0, M4(2.179e-02, 1.315e-01, 1.091e-01, 3.489e-02, 1.135e-01, 1.456e-01, -8.571e-02, 2.148e-01, -6.439e-01, -1.469e-01, -3.470e-03, 1.237e-01, 1.899e-01, -1.441e-02, -2.881e-01, -2.385e-01), r0);
	r1 = MulAdd(s0_1_0, M4(3.991e-02, 5.187e-01, 1.634e-01, -7.621e-01, 4.860e-03, -5.186e-01, -4.390e-02, 3.916e-01, -2.656e-02, 2.354e-01, 1.743e-02, -4.420e-02, 3.155e-02, -1.099e-01, 1.887e-01, 2.126e-01), r1);
	r2 = MulAdd(s0_1_0, M4(-1.800e-01, -1.850e-01, -9.838e-02, 1.124e-01, -9.580e-02, 1.004e-01, 5.531e-02, -8.521e-02, -8.652e-02, -1.008e-01, 2.892e-02, 6.391e-04, -2.675e-01, 8.596e-02, 9.709e-02, 5.314e-02), r2);
	r0 = MulAdd(s0_1_1, M4(2.777e-01, 1.440e-01, -6.385e-01, -1.450e+00, -1.948e-01, 2.267e-01, 2.585e-01, 1.445e-02, -1.438e-01, 1.994e-01, 5.586e-02, 2.287e-01, -4.307e-01, 3.486e-01, -2.227e-01, 6.488e-01), r0);
	r1 = MulAdd(s0_1_1, M4(4.186e-01, -2.523e-01, -8.654e-02, 2.292e-02, -1.880e-01, 1.430e-01, 9.041e-02, -7.149e-02, -7.582e-02, -1.469e-01, -6.140e-03, -3.484e-01, -4.464e-01, -2.844e-01, -1.972e-01, -5.546e-01), r1);
	r2 = MulAdd(s0_1_1, M4(2.144e-02, 3.218e-01, 7.520e-01, 3.471e-01, -9.673e-02, -5.279e-02, 4.198e-03, 4.155e-02, 9.928e-02, 3.935e-03, 7.468e-02, 5.112e-02, 8.057e-01, 2.750e-01, 4.573e-01, 4.273e-01), r2);
	r0 = MulAdd(s0_1_2, M4(3.181e-02, 1.709e-02, 1.704e-01, 1.362e-01, 2.060e-01, -5.583e-02, -1.044e-01, -2.850e-02, -3.306e-01, -7.003e-02, 5.325e-02, 1.275e-01, -2.124e-01, 3.360e-01, 4.939e-01, -2.286e-01), r0);
	r1 = MulAdd(s0_1_2, M4(-5.481e-02, -1.274e-02, 1.571e-01, -2.584e-03, 3.455e-02, -7.897e-02, -9.256e-02, 1.617e-01, -4.284e-02, 5.980e-03, 3.708e-02, -1.362e-01, 2.144e-01, 1.961e-03, -3.237e-02, 2.662e-01), r1);
	r2 = MulAdd(s0_1_2, M4(-6.732e-03, -2.786e-01, -1.920e-02, 1.169e-01, -1.079e-01, 6.667e-02, -7.789e-02, -1.140e-01, 6.323e-02, 5.802e-02, 2.899e-02, 3.782e-02, -1.523e-01, -5.014e-01, -3.662e-01, -2.144e-01), r2);
	r0 = MulAdd(s0_2_0, M4(-1.399e-01, -7.387e-02, 3.718e-02, 3.655e-02, 2.862e-01, 2.917e-02, -4.900e-02, 1.343e-01, -3.545e-01, -1.507e-01, -1.065e-02, 1.204e-01, -3.091e-01, 4.089e-02, 8.930e-02, 1.109e-01), r0);
	r1 = MulAdd(s0_2_0, M4(1.714e-02, -9.474e-03, 1.035e-02, -3.036e-02, -4.332e-02, 7.243e-02, -7.869e-02, -4.499e-02, 2.607e-02, 2.152e-01, 2.390e-03, 8.116e-02, -3.033e-02, -6.017e-02, -8.380e-02, -2.836e-01), r1);
	r2 = MulAdd(s0_2_0, M4(-1.020e-02, 2.711e-02, -1.884e-02, 1.911e-02, -1.132e-01, -1.352e-01, 2.838e-02, -2.843e-03, -3.526e-02, 9.918e-03, 3.846e-02, -4.973e-02, -1.550e-02, -2.626e-02, -2.655e-02, -3.604e-02), r2);
	r0 = MulAdd(s0_2_1, M4(1.451e-02, -3.809e-02, 3.990e-02, -1.831e-01, 4.474e-02, 4.242e-05, -3.386e-02, 3.569e-01, 5.130e-02, -1.371e-01, -1.584e-02, 2.103e-01, -4.593e-01, -1.627e-02, -3.550e-01, 3.969e-02), r0);
	r1 = MulAdd(s0_2_1, M4(1.261e-02, -1.549e-02, 7.325e-02, -5.494e-03, -8.480e-02, 1.868e-02, 2.876e-02, 5.045e-02, 8.408e-02, 3.500e-02, -5.359e-02, -4.843e-01, 2.199e-02, -1.043e-01, 2.972e-01, 5.220e-02), r1);
	r2 = MulAdd(s0_2_1, M4(-1.231e-01, -6.732e-02, 3.430e-02, -9.107e-02, -1.767e-01, -8.722e-02, 2.471e-02, -5.755e-02, 3.489e-02, 9.203e-02, 2.265e-02, 2.308e-02, -4.872e-01, 1.475e-01, 1.755e-01, -1.433e-01), r2);
	r0 = MulAdd(s0_2_2, M4(2.863e-02, 8.084e-02, -3.705e-02, 7.152e-03, 1.688e-01, -2.701e-02, 1.544e-02, 1.577e-01, -9.706e-02, 1.224e-02, -1.930e-02, 1.080e-01, 2.449e-02, -2.085e-01, 9.539e-02, 6.446e-03), r0);
	r1 = MulAdd(s0_2_2, M4(1.278e-02, 2.423e-02, -6.449e-02, 1.108e-02, 1.826e-02, 5.656e-02, -6.023e-05, 5.363e-02, 2.012e-02, 1.050e-02, 6.042e-03, -1.679e-01, -8.546e-03, -2.296e-01, 3.288e-02, 6.906e-02), r1);
	r2 = MulAdd(s0_2_2, M4(8.388e-02, -2.117e-02, -8.629e-03, 5.410e-02, -1.548e-01, -1.168e-03, -2.935e-02, -7.434e-02, 1.562e-02, -5.832e-03, 5.447e-02, 5.615e-02, 1.025e-01, -2.264e-02, -7.165e-03, -9.662e-02), r2);
	// Accumulate the T4 taps.
	r0 = MulAdd(s1_0_0, M4(4.393e-02, -3.295e-02, -4.448e-02, -2.675e-01, -1.621e-01, 1.144e-02, 2.604e-02, 3.273e-02, 1.915e-01, -1.818e-01, 1.968e-02, -1.508e-02, -1.921e-01, -1.484e-01, -3.336e-02, -4.131e-02), r0);
	r1 = MulAdd(s1_0_0, M4(2.569e-01, -2.582e+00, 1.893e-02, 3.325e-01, -8.104e-04, 8.359e-02, -1.199e-02, -1.625e-01, 3.504e-02, 2.130e-01, 2.373e-02, -1.410e-01, -1.670e-02, -5.005e-01, -8.379e-02, -4.381e-02), r1);
	r2 = MulAdd(s1_0_0, M4(2.399e-01, -5.492e-02, 7.391e-02, 4.285e-02, -5.533e-02, -1.231e-01, -1.896e-02, -3.396e-03, 4.602e-02, -3.153e-02, 9.462e-02, -3.125e-02, -2.166e-04, 1.351e-01, -1.085e-02, -5.224e-02), r2);
	r0 = MulAdd(s1_0_1, M4(3.487e-01, -7.019e-02, -1.621e-01, -3.816e-01, -8.034e-02, -1.074e-01, -1.206e-02, -1.835e-02, -1.821e-01, -3.839e-01, -3.959e-01, 8.855e-02, -2.417e-01, -2.423e-02, -4.326e-01, 1.238e-01), r0);
	r1 = MulAdd(s1_0_1, M4(4.290e-03, 1.489e-01, 9.698e-02, -5.690e-01, 4.466e-03, -3.371e-02, 1.130e-02, -1.867e-01, 6.432e-02, 1.257e-01, -8.754e-02, -2.624e-02, -6.373e-02, -4.008e-01, 1.415e-02, 9.504e-03), r1);
	r2 = MulAdd(s1_0_1, M4(1.057e-02, 1.400e-01, 4.181e-01, 2.088e-02, -5.234e-02, -1.978e-02, -1.528e-02, -1.689e-03, 1.184e-01, 1.135e-01, -3.136e-02, 5.561e-02, -2.182e-01, -3.383e-01, -8.270e-02, 2.507e-03), r2);
	r0 = MulAdd(s1_0_2, M4(2.802e-01, 7.662e-02, 5.710e-02, -2.369e-01, -5.326e-02, 8.114e-02, -8.379e-03, 1.163e-01, 2.007e-01, -9.951e-02, -2.607e-01, 6.310e-02, -1.718e-03, 7.582e-02, 1.765e-01, 2.865e-02), r0);
	r1 = MulAdd(s1_0_2, M4(-5.031e-03, 1.033e-01, -5.366e-03, 9.244e-02, 2.857e-03, -4.543e-02, -5.357e-03, -6.026e-02, 7.987e-02, -6.349e-04, 1.721e-03, -1.385e-01, 2.753e-02, 1.200e-01, -1.163e-02, 1.723e-01), r1);
	r2 = MulAdd(s1_0_2, M4(1.464e-01, 1.029e-01, 1.336e-02, 3.836e-02, 1.447e-02, -4.724e-02, 1.229e-02, 5.121e-03, -1.277e-02, -5.675e-02, 4.433e-02, -3.360e-03, -1.468e-01, 7.835e-02, -1.192e-02, -1.411e-02), r2);
	r0 = MulAdd(s1_1_0, M4(-1.155e+00, -4.480e-02, 9.309e-02, -4.840e-01, -2.822e-01, 2.839e-02, -2.803e-02, 1.155e-01, -5.551e-01, -2.681e-01, -9.137e-03, 5.448e-02, -1.200e+00, -3.168e-01, -4.549e-02, 4.248e-02), r0);
	r1 = MulAdd(s1_1_0, M4(6.270e-01, -1.010e-01, 2.300e-01, -3.078e-01, -8.238e-02, 2.719e-01, -5.990e-02, -7.224e-02, 6.262e-02, 3.181e-01, -6.099e-02, -5.150e-02, -2.629e-02, -2.842e-01, 5.434e-03, 4.125e-01), r1);
	r2 = MulAdd(s1_1_0, M4(3.415e-01, 1.793e-01, -1.568e-02, 3.302e-01, -8.442e-02, -3.291e-02, 2.154e-03, 1.605e-02, 6.640e-02, 2.173e-01, -2.463e-02, 6.188e-02, -2.254e-01, -7.148e-02, -3.065e-02, -1.073e-01), r2);
	r0 = MulAdd(s1_1_1, M4(5.716e-02, -4.461e-01, 2.311e-01, 7.908e-01, -2.363e-01, -5.926e-02, 1.901e-03, 2.347e-01, -5.171e-01, -1.383e-01, -2.228e-01, -6.570e-01, -7.284e-01, -5.200e-01, 2.967e-02, 1.179e-01), r0);
	r1 = MulAdd(s1_1_1, M4(-1.708e-01, 1.357e-01, -5.094e-02, 1.196e-01, -1.033e-01, -1.548e-01, 1.367e-02, -5.055e-02, 7.184e-02, 1.133e-01, -1.069e-01, -6.623e-01, -1.166e-01, -1.765e-01, 1.748e-01, 2.983e-01), r1);
	r2 = MulAdd(s1_1_1, M4(-2.811e-02, -1.396e-01, -5.402e-02, -1.027e-01, 4.418e-02, -3.840e-02, -2.957e-03, -9.759e-02, 5.618e-01, 2.555e-01, 5.290e-03, 1.648e-01, -3.711e-01, 1.813e-01, -5.165e-02, -8.706e-03), r2);
	r0 = MulAdd(s1_1_2, M4(2.942e-02, 1.904e-01, 2.664e-02, -5.421e-04, -5.558e-02, 1.484e-01, 6.836e-02, 2.482e-01, 1.626e-01, -1.076e-01, 1.891e-02, 2.238e-02, 1.782e-01, -1.802e-01, -1.743e-01, -5.355e-02), r0);
	r1 = MulAdd(s1_1_2, M4(4.479e-02, 2.373e-02, -1.670e-02, -2.848e-01, 2.063e-02, -1.170e-01, -3.136e-03, -2.457e-01, 6.175e-02, -1.254e-02, 1.866e-02, -1.387e-01, -7.886e-02, 3.014e-02, 1.921e-02, 7.901e-03), r1);
	r2 = MulAdd(s1_1_2, M4(5.406e-02, -9.129e-02, 1.495e-02, -2.949e-02, 1.095e-01, -4.392e-02, -2.196e-02, 2.408e-02, -3.649e-02, 1.296e-01, 9.234e-02, -1.345e-01, -5.041e-02, 6.324e-02, -1.775e-02, -2.158e-03), r2);
	r0 = MulAdd(s1_2_0, M4(8.679e-02, 5.701e-02, -6.133e-02, -9.642e-02, -3.772e-01, 6.917e-02, -8.084e-02, 1.610e-01, -1.811e-01, -1.501e-01, 1.724e-01, 1.036e-02, 5.775e-02, -2.087e-02, 4.915e-02, -1.170e-01), r0);
	r1 = MulAdd(s1_2_0, M4(-1.613e-03, -4.104e-03, -7.692e-02, -7.496e-02, -3.253e-02, -1.734e-02, -7.336e-02, -1.748e-01, 6.113e-02, 9.483e-02, 4.794e-02, 5.502e-01, 7.052e-02, 3.529e-02, 4.266e-02, 1.345e-01), r1);
	r2 = MulAdd(s1_2_0, M4(-7.282e-03, 9.590e-02, -7.558e-03, 1.223e-01, -7.738e-02, -6.485e-02, 3.983e-03, -6.360e-02, -2.617e-02, -8.540e-03, 6.395e-02, -3.045e-02, 4.676e-03, 3.692e-02, 4.479e-02, 5.626e-02), r2);
	r0 = MulAdd(s1_2_1, M4(-1.043e-02, 7.251e-02, 2.617e-02, 1.771e-01, -3.045e-01, 2.153e-01, 9.522e-03, 1.842e-01, 1.631e-01, -8.144e-02, 2.290e-01, 1.197e-02, 1.878e-01, -8.502e-02, -4.346e-02, -6.858e-02), r0);
	r1 = MulAdd(s1_2_1, M4(-9.532e-02, 1.544e-01, 1.752e-02, -2.860e-02, -1.514e-02, -8.833e-02, -2.373e-02, -2.523e-01, 4.630e-02, -2.007e-02, 5.019e-02, 1.888e-01, -1.252e-02, 5.758e-02, 1.080e-02, 5.630e-04), r1);
	r2 = MulAdd(s1_2_1, M4(1.568e-01, -6.033e-02, -7.233e-03, -6.183e-04, 3.094e-02, 3.673e-02, -8.772e-04, -3.506e-02, 1.865e-03, -8.704e-02, 5.076e-02, 9.331e-03, -1.258e-02, -7.546e-02, -2.753e-02, 1.451e-01), r2);
	r0 = MulAdd(s1_2_2, M4(-3.943e-02, -7.739e-02, 1.141e-02, 3.386e-02, 2.409e-03, -1.048e-03, -1.029e-02, 2.017e-01, 9.139e-02, 1.907e-02, 7.202e-02, -5.458e-02, 1.323e-01, 4.419e-02, -3.845e-02, 2.763e-02), r0);
	r1 = MulAdd(s1_2_2, M4(4.840e-02, -2.158e-02, -2.862e-02, -2.257e-02, 2.203e-02, -9.015e-02, -3.538e-02, -1.152e-01, 4.691e-02, 6.075e-02, 6.068e-02, -9.781e-02, -1.055e-02, 8.470e-02, -4.911e-02, 1.405e-01), r1);
	r2 = MulAdd(s1_2_2, M4(-1.508e-02, 1.151e-02, -2.563e-03, -6.476e-02, 9.839e-03, -7.904e-02, 1.169e-02, 9.615e-03, 1.173e-01, 3.705e-02, -1.100e-02, -1.275e-02, 1.690e-02, 5.741e-02, -3.376e-02, 3.410e-02), r2);
	// The T3 taps are fully consumed, so the s0_* variables are reused for the T5 taps.
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	// Accumulate the T5 taps.
	r0 = MulAdd(s0_0_0, M4(-2.073e-01, -2.885e-03, 7.948e-03, 1.310e-01, 1.655e-01, -3.284e-02, -1.090e-02, 1.014e-02, -1.347e-01, -2.421e-02, -1.323e-01, -3.403e-02, -1.109e-01, -2.861e-02, 9.110e-02, -9.107e-02), r0);
	r1 = MulAdd(s0_0_0, M4(1.079e-02, -1.731e-01, 7.495e-02, 2.995e-02, 4.917e-03, 1.123e-02, 1.015e-02, -9.103e-02, -3.501e-02, -8.630e-02, -2.111e-02, 2.900e-01, 2.264e-02, 4.362e-01, -6.079e-02, 5.780e-02), r1);
	r2 = MulAdd(s0_0_0, M4(5.937e-02, 1.048e-01, -3.052e-02, 5.846e-03, -3.546e-02, -1.096e-02, 6.774e-03, 8.801e-03, -1.333e-01, 1.456e-01, 2.041e-02, 3.865e-02, -5.588e-02, -2.757e-02, -1.255e-02, 4.885e-03), r2);
	r0 = MulAdd(s0_0_1, M4(-2.114e-02, 1.890e-01, -6.080e-02, 1.601e-01, 9.935e-02, -4.446e-02, -1.694e-01, 1.226e-02, -3.223e-02, -1.327e-01, -2.163e-01, -5.216e-02, 3.068e-02, -2.806e-01, -2.214e-01, -5.581e-02), r0);
	r1 = MulAdd(s0_0_1, M4(1.159e-02, 6.686e-02, -1.203e-02, 2.586e-02, -2.453e-02, 1.501e-01, 1.428e-02, -1.431e-01, -1.293e-02, 1.005e-01, -5.598e-03, 4.522e-01, 2.570e-02, -1.836e-01, -9.186e-03, 1.243e-01), r1);
	r2 = MulAdd(s0_0_1, M4(6.832e-02, 1.109e-01, 2.888e-02, -7.086e-03, 6.509e-02, -1.226e-02, 4.822e-02, -2.200e-02, -3.422e-01, 8.815e-02, -4.828e-02, 1.967e-02, 4.237e-02, -1.083e-01, 4.111e-02, 1.588e-03), r2);
	r0 = MulAdd(s0_0_2, M4(-6.349e-02, 5.097e-02, 4.853e-02, -6.523e-02, 1.366e-01, 3.237e-02, 1.391e-01, 4.145e-02, -1.073e-01, -1.427e-01, -3.020e-01, 1.075e-02, 9.771e-02, 3.156e-03, -7.502e-02, -6.412e-02), r0);
	r1 = MulAdd(s0_0_2, M4(2.023e-02, -1.925e-02, 2.873e-02, -1.734e-01, -3.196e-02, 1.403e-01, 2.582e-02, -3.704e-02, -2.347e-02, -1.496e-02, -4.016e-02, 2.252e-01, -5.528e-02, -2.551e-02, -6.763e-03, 2.616e-02), r1);
	r2 = MulAdd(s0_0_2, M4(-1.654e-03, -7.653e-02, 1.654e-02, 6.765e-03, -4.989e-02, 9.351e-02, 3.588e-02, 3.366e-02, -2.376e-01, -3.120e-02, -5.345e-02, -3.772e-02, -8.353e-02, -9.581e-03, -1.173e-02, -2.972e-02), r2);
	r0 = MulAdd(s0_1_0, M4(5.712e-01, 4.305e-02, 3.184e-02, -5.558e-02, -3.975e-02, 3.606e-02, 1.344e-01, -3.962e-02, 2.982e-01, -6.053e-04, -1.589e-01, 9.418e-02, -4.619e-01, -1.875e-01, 3.955e-02, 5.997e-02), r0);
	r1 = MulAdd(s0_1_0, M4(-6.087e-02, -2.704e-01, 4.492e-02, 3.213e-01, 8.188e-02, -8.424e-01, 3.334e-02, -1.295e-01, 1.910e-02, -1.045e-01, 1.563e-02, 2.160e-01, -6.658e-02, 3.979e-01, -1.040e-01, 2.705e-01), r1);
	r2 = MulAdd(s0_1_0, M4(-3.647e-02, 2.265e-02, 7.663e-03, -1.333e-02, 1.210e-01, -3.067e-02, -3.087e-03, 2.872e-02, -1.376e-02, 1.022e-01, 5.203e-02, 1.622e-02, -7.550e-02, -4.181e-02, -7.325e-02, -6.054e-02), r2);
	r0 = MulAdd(s0_1_1, M4(8.488e-01, 1.048e-01, 4.430e-01, 1.287e-01, 5.247e-01, -2.116e-01, 2.249e-01, -6.028e-02, -1.546e-02, -2.114e-01, -1.140e-01, -4.995e-02, -6.261e-01, -8.701e-02, 1.658e-02, 4.500e-01), r0);
	r1 = MulAdd(s0_1_1, M4(2.563e-02, 1.447e-01, 1.530e-02, 1.216e-01, -8.168e-03, -2.397e-01, 1.693e-01, -5.370e-01, -3.994e-02, 6.204e-02, -2.108e-02, 1.067e-01, -8.619e-02, 2.862e-01, -2.427e-01, 1.871e-01), r1);
	r2 = MulAdd(s0_1_1, M4(1.221e-01, -2.916e-01, -1.003e-01, -4.690e-02, -1.488e-02, 3.652e-03, -1.911e-01, 2.983e-02, -2.231e-01, 9.535e-02, -2.069e-02, 1.122e-01, 2.362e-01, 3.100e-01, 3.006e-02, -1.024e-01), r2);
	r0 = MulAdd(s0_1_2, M4(1.916e-02, 1.040e-01, 5.935e-02, 8.190e-02, 2.416e-01, 3.663e-03, 2.324e-02, -4.383e-01, -4.091e-02, -9.447e-02, -1.357e-01, 9.980e-02, 1.471e-01, -3.115e-02, -1.516e-01, -4.576e-02), r0);
	r1 = MulAdd(s0_1_2, M4(5.641e-02, -1.140e-01, -1.471e-02, -3.351e-01, 2.183e-02, 1.148e-01, -3.533e-02, -2.080e-01, -3.132e-02, 6.610e-02, -3.884e-03, 4.195e-01, 2.519e-02, 9.057e-02, -1.061e-02, 9.321e-02), r1);
	r2 = MulAdd(s0_1_2, M4(8.883e-02, -6.591e-02, -1.204e-02, -7.192e-02, 1.578e-01, 1.506e-01, -3.464e-02, 1.097e-01, -5.575e-01, -2.792e-02, -8.415e-03, -3.757e-02, -2.318e-01, 4.881e-02, 2.612e-02, -7.005e-02), r2);
	r0 = MulAdd(s0_2_0, M4(-1.831e-01, 3.585e-02, 2.233e-02, 6.002e-02, 2.055e-01, -7.205e-02, 3.852e-03, -5.522e-02, -8.859e-02, -7.539e-02, -8.218e-02, 2.046e-02, 3.419e-01, -6.625e-02, -2.028e-02, -1.508e-02), r0);
	r1 = MulAdd(s0_2_0, M4(4.121e-02, 2.850e-01, 2.148e-03, -1.046e-01, 5.802e-02, -2.377e-01, 1.353e-02, 3.650e-02, -2.001e-02, 1.320e-01, -7.943e-03, -3.328e-03, -1.779e-02, 3.090e-02, 5.641e-02, 5.749e-02), r1);
	r2 = MulAdd(s0_2_0, M4(1.268e-01, -9.738e-02, 4.717e-02, -3.273e-02, 1.548e-01, 1.124e-01, 3.914e-02, 4.995e-02, -2.959e-01, -5.985e-03, -1.985e-02, 4.931e-02, -2.142e-01, -3.858e-02, -5.355e-02, -5.923e-02), r2);
	r0 = MulAdd(s0_2_1, M4(1.941e-02, 1.313e-01, -3.649e-02, 3.128e-02, -8.230e-01, 1.338e-02, 1.548e-01, 5.652e-02, -3.031e-01, 3.842e-02, -1.023e-01, 1.008e-01, 2.879e-01, -9.801e-02, -1.586e-02, -1.170e-01), r0);
	r1 = MulAdd(s0_2_1, M4(1.419e-02, -5.248e-02, 3.668e-02, -5.624e-02, 9.053e-02, -2.086e-02, 5.137e-01, 3.149e-01, -1.649e-02, 5.181e-02, 3.778e-02, 2.124e-01, -3.838e-02, 1.525e-01, -1.075e-03, 2.692e-01), r1);
	r2 = MulAdd(s0_2_1, M4(1.691e-01, 1.897e-02, 3.591e-02, 1.446e-02, 2.190e-01, -3.740e-01, 2.753e-02, 1.964e-01, -3.154e-01, 6.559e-02, 3.800e-02, 3.765e-02, 1.991e-02, -5.351e-03, -1.587e-01, -4.168e-02), r2);
	r0 = MulAdd(s0_2_2, M4(-1.926e-01, 7.593e-02, 1.169e-02, 1.982e-02, -2.546e-01, 9.980e-02, 1.278e-01, -1.172e+00, -1.775e-01, 6.683e-02, -1.432e-01, -5.396e-02, 1.596e-01, 2.865e-02, -1.246e-01, -5.689e-02), r0);
	r1 = MulAdd(s0_2_2, M4(1.611e-02, -8.472e-02, 8.093e-04, 1.619e-02, 4.636e-03, -1.238e-01, 3.457e-02, -5.895e-01, -2.843e-02, -1.275e-01, 1.789e-02, 1.288e-01, 5.508e-02, 1.199e-01, -1.875e-02, 1.774e-01), r1);
	r2 = MulAdd(s0_2_2, M4(1.548e-01, -4.316e-02, -8.196e-03, -4.015e-03, 2.926e-01, 1.181e-01, 1.038e-01, 9.861e-02, -2.074e-01, 3.924e-02, 2.767e-02, -1.060e-02, -4.989e-02, 1.194e-01, 2.919e-02, -7.371e-02), r2);
	// Clamp negative values to zero (ReLU) and store each accumulator in its output texture.
	r0 = max(r0, 0.0);
	T0[gxy] = r0;
	r1 = max(r1, 0.0);
	T1[gxy] = r1;
	r2 = max(r2, 0.0);
	T2[gxy] = r2;
}

//!PASS 4
//!DESC conv3 (12x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T0, T1, T2
//!OUT T3, T4, T5

// Tap helpers for pass 4: each samples one input texture of this pass through the
// O() macro at an offset of (x, y) from the current position and casts the result to V4.
#define L0(x, y) V4(O(T0, x, y))
#define L1(x, y) V4(O(T1, x, y))
#define L2(x, y) V4(O(T2, x, y))

void Pass4(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	float2 pos = (gxy + 0.5) * pt;
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	r0 = MulAdd(s0_0_0, M4(-1.147e-03, -3.143e-02, 1.999e-01, -2.199e-02, -5.716e-02, -2.313e-02, -4.360e-02, 4.923e-04, -6.488e-03, 3.834e-02, 2.813e-02, 1.077e-02, -3.076e-02, 4.032e-02, 1.017e-01, 3.632e-02), r0);
	r1 = MulAdd(s0_0_0, M4(-9.244e-02, -6.314e-02, 4.337e-02, 4.995e-02, -1.294e-01, -1.617e-02, 5.786e-02, -3.886e-03, 1.227e-03, 2.723e-02, 5.879e-02, 3.083e-02, 1.589e-02, -9.004e-02, 3.810e-02, 1.606e-02), r1);
	r2 = MulAdd(s0_0_0, M4(5.131e-03, -5.730e-03, 1.233e-02, -2.174e-02, -1.112e-01, -2.886e-02, -9.422e-03, -1.850e-02, 1.429e-02, -1.700e-02, -2.908e-02, -1.046e-02, -1.151e-01, 5.771e-02, 1.671e-01, 7.104e-02), r2);
	r0 = MulAdd(s0_0_1, M4(1.237e-01, -8.863e-02, 1.301e-01, -1.038e-02, 6.233e-02, -6.462e-02, 5.774e-04, 2.709e-02, 2.100e-02, 4.533e-02, 1.935e-02, -2.740e-02, -6.144e-03, -5.068e-02, 6.931e-02, 4.801e-02), r0);
	r1 = MulAdd(s0_0_1, M4(-3.669e-02, 8.264e-02, 1.100e-01, 7.740e-02, -2.588e-02, 4.328e-02, -1.581e-01, -2.277e-01, -1.660e-02, -4.844e-02, 9.342e-03, 1.578e-01, -1.481e-01, -2.769e-01, -3.135e-01, -7.017e-01), r1);
	r2 = MulAdd(s0_0_1, M4(1.168e-01, 4.617e-02, 2.075e-01, 7.056e-02, 3.399e-02, -2.130e-02, 1.207e-01, -9.114e-04, 8.850e-02, 1.922e-02, -6.396e-03, -1.863e-02, -6.756e-02, 6.373e-02, 2.490e-01, 1.899e-02), r2);
	r0 = MulAdd(s0_0_2, M4(-1.238e-01, -4.603e-02, 8.335e-02, 1.714e-02, 3.356e-02, -2.488e-02, -6.843e-03, -3.544e-02, 9.339e-03, 3.138e-02, 4.410e-02, -1.285e-02, 3.697e-04, -1.566e-02, -1.649e-02, -2.697e-03), r0);
	r1 = MulAdd(s0_0_2, M4(1.567e-01, -1.790e-02, 1.265e-03, 1.499e-01, 8.410e-02, -3.771e-02, -4.766e-02, -1.852e-02, 8.719e-02, -3.163e-02, 4.318e-02, 2.248e-01, -1.116e-01, 3.596e-02, 7.247e-02, -4.053e-01), r1);
	r2 = MulAdd(s0_0_2, M4(-3.996e-02, 2.926e-02, -1.334e-01, -9.393e-03, -2.310e-02, 1.175e-02, 6.418e-02, -9.978e-03, 1.348e-02, -5.206e-03, 1.179e-01, 5.048e-03, 2.366e-03, 3.495e-02, 2.292e-03, -2.825e-02), r2);
	r0 = MulAdd(s0_1_0, M4(-1.566e-02, 9.339e-03, 2.398e-01, 4.516e-02, 5.502e-03, 1.579e-01, -8.590e-02, 2.838e-02, -3.432e-02, -1.333e-01, 7.881e-02, 1.685e-01, -2.494e-01, 1.004e-01, 1.295e-01, -1.777e-01), r0);
	r1 = MulAdd(s0_1_0, M4(-5.529e-02, -1.743e-01, -1.600e-02, 3.006e-02, -1.649e-02, 1.687e-01, 1.343e-01, 1.091e-02, 1.558e-01, -1.972e-01, 3.818e-02, 2.611e-01, 2.312e-01, 3.221e-02, 4.857e-02, -6.451e-02), r1);
	r2 = MulAdd(s0_1_0, M4(-1.293e-02, 2.188e-02, 1.861e-02, 1.304e-02, -1.124e-01, -8.472e-02, 2.078e-02, -5.474e-02, -9.059e-02, 1.047e-02, 4.283e-01, 8.854e-02, -3.296e-01, 1.392e-01, 2.329e-01, 6.949e-02), r2);
	r0 = MulAdd(s0_1_1, M4(2.407e-01, 1.095e-02, -3.481e-01, 6.020e-02, 9.225e-02, 1.055e-01, -1.694e-01, -2.212e-01, -9.128e-03, -2.613e-01, 2.895e-02, -9.241e-03, 3.991e-03, 1.495e-01, -1.157e-01, -1.323e-01), r0);
	r1 = MulAdd(s0_1_1, M4(2.431e-01, 2.028e-01, 1.762e-01, 3.523e-02, -3.615e-01, 2.292e-01, 1.598e-01, 1.991e-01, 1.827e-01, 2.670e-02, -4.757e-01, -1.609e-01, -1.305e-01, -7.384e-01, -6.952e-01, -3.849e-01), r1);
	r2 = MulAdd(s0_1_1, M4(2.607e-01, 2.432e-01, 2.273e-02, 1.787e-01, 2.939e-01, 8.720e-02, 1.118e-01, 6.963e-02, 6.469e-02, -1.566e-03, -3.076e-01, -9.642e-02, -3.702e-01, -2.907e-02, 2.848e-01, -7.739e-02), r2);
	r0 = MulAdd(s0_1_2, M4(-1.233e-01, -4.466e-03, -4.747e-03, 1.248e-01, -3.747e-02, 6.510e-02, 4.517e-02, 5.404e-02, -2.516e-02, -3.611e-02, -9.053e-02, 3.989e-02, 7.578e-04, -9.800e-03, 1.380e-02, -5.194e-02), r0);
	r1 = MulAdd(s0_1_2, M4(-1.442e-02, 1.388e-04, 8.914e-02, 1.211e-02, 1.809e-01, -1.149e-01, 1.986e-01, 1.294e-01, 7.698e-02, -8.799e-03, -2.692e-02, -7.576e-02, -2.426e-01, 9.821e-04, -3.852e-01, -1.144e-01), r1);
	r2 = MulAdd(s0_1_2, M4(5.533e-02, 6.612e-03, -1.211e-01, 2.766e-02, -1.013e-02, 7.833e-02, -3.847e-02, -1.354e-02, -4.257e-02, 4.314e-03, 1.118e-01, -1.486e-02, -1.562e-02, 4.944e-02, 6.707e-03, 1.726e-03), r2);
	r0 = MulAdd(s0_2_0, M4(-1.828e-02, 3.186e-02, 7.008e-02, 1.467e-02, -5.390e-02, 6.998e-02, -2.090e-01, 1.121e-01, 1.135e-01, 1.392e-01, 2.343e-01, -1.275e-01, -1.597e-02, -2.249e-01, 1.748e-02, 7.336e-03), r0);
	r1 = MulAdd(s0_2_0, M4(8.772e-03, -7.248e-02, 2.408e-02, 2.096e-02, 5.329e-02, 4.772e-02, -1.299e-01, -2.172e-02, 1.979e-01, 9.474e-05, 2.407e-01, 3.252e-01, -5.506e-03, 6.287e-02, -1.730e-03, 3.169e-02), r1);
	r2 = MulAdd(s0_2_0, M4(-9.102e-03, 5.103e-03, -1.744e-02, 1.001e-02, -1.274e-01, -2.264e-02, -1.151e-02, 3.246e-02, 2.776e-01, 7.105e-02, -1.446e-01, 2.476e-02, 4.479e-02, 6.260e-02, -1.338e-03, 2.897e-02), r2);
	r0 = MulAdd(s0_2_1, M4(1.228e-01, 8.960e-02, 8.093e-02, -2.799e-05, 1.190e-01, 1.063e-01, -3.685e-01, 3.667e-01, -2.387e-01, 1.362e-01, 1.210e-01, 8.433e-02, 2.846e-02, -5.068e-01, 5.973e-02, -1.317e-01), r0);
	r1 = MulAdd(s0_2_1, M4(7.571e-02, 2.136e-02, 7.805e-02, 3.014e-02, -7.544e-02, 9.455e-02, 4.176e-02, -1.479e-01, 2.568e-01, 9.643e-02, 9.639e-02, 2.046e-01, -1.374e-01, -7.046e-02, 1.292e-01, 5.363e-02), r1);
	r2 = MulAdd(s0_2_1, M4(5.958e-02, 6.190e-02, 9.003e-04, 3.730e-02, 1.470e-01, 1.538e-01, 1.838e-02, 1.094e-01, 1.116e-01, 6.421e-02, -1.323e-01, 1.187e-01, 8.592e-02, 7.174e-02, -2.447e-03, 6.736e-03), r2);
	r0 = MulAdd(s0_2_2, M4(-6.127e-03, 9.709e-02, 2.057e-02, -4.134e-02, -6.237e-02, 4.134e-02, -1.039e-01, 1.871e-01, 5.981e-02, 5.446e-02, 1.977e-02, -4.733e-03, 2.173e-02, -4.688e-02, -2.097e-03, 7.559e-03), r0);
	r1 = MulAdd(s0_2_2, M4(8.685e-02, 3.885e-02, 1.744e-01, -2.946e-02, -8.860e-02, -7.615e-03, -1.294e-01, 1.775e-02, 4.080e-02, -5.161e-02, 1.075e-01, -3.325e-02, 1.357e-01, -8.583e-03, -5.626e-02, 2.045e-02), r1);
	r2 = MulAdd(s0_2_2, M4(7.202e-02, -6.692e-03, -5.426e-02, -1.738e-02, -1.452e-02, 4.147e-02, -2.179e-03, 3.438e-02, -3.539e-02, 2.192e-02, -5.047e-02, -8.028e-03, 4.465e-02, 3.430e-02, -4.107e-02, 1.032e-02), r2);
	r0 = MulAdd(s1_0_0, M4(-9.155e-02, -6.270e-02, 2.491e-01, -9.078e-03, -1.985e-02, -2.527e-02, 1.294e-02, 1.184e-02, 6.516e-02, -7.880e-02, 2.050e-01, 5.405e-02, -3.738e-03, 2.451e-02, -5.778e-03, 3.930e-03), r0);
	r1 = MulAdd(s1_0_0, M4(-1.140e-01, 3.787e-02, 7.493e-02, 6.210e-02, -4.014e-02, 2.540e-02, 5.631e-02, -1.270e-02, 8.988e-03, -9.654e-02, 4.168e-02, 3.745e-02, -6.064e-02, 5.811e-02, 2.493e-02, -1.385e-01), r1);
	r2 = MulAdd(s1_0_0, M4(-1.342e-01, 6.066e-03, 4.127e-02, 3.284e-02, -7.692e-02, -4.837e-03, 7.426e-02, -1.543e-02, -7.040e-03, -2.499e-02, -6.237e-02, -7.423e-02, -4.926e-02, -3.503e-02, 1.891e-02, -4.812e-02), r2);
	r0 = MulAdd(s1_0_1, M4(-5.188e-02, 5.996e-02, 2.666e-01, 1.267e-01, 4.252e-02, -6.213e-02, 1.026e-02, -1.829e-02, -4.100e-03, -2.456e-01, 6.870e-02, -7.469e-03, -6.306e-02, -2.325e-02, -1.835e-03, -4.983e-02), r0);
	r1 = MulAdd(s1_0_1, M4(1.289e-02, 9.127e-02, 8.528e-02, 1.469e-01, 7.349e-02, -1.160e-02, 5.081e-03, -2.180e-01, -1.665e-01, 6.872e-02, -1.085e-02, 9.080e-02, -4.386e-02, 2.612e-02, -6.812e-03, -1.090e-01), r1);
	r2 = MulAdd(s1_0_1, M4(-2.605e-01, 2.218e-02, 3.228e-01, 1.623e-01, 4.207e-02, -1.131e-02, 5.821e-02, 5.515e-03, 6.305e-02, -4.059e-02, 1.051e-01, 4.794e-02, 3.982e-02, -8.629e-03, -1.727e-02, 3.937e-03), r2);
	r0 = MulAdd(s1_0_2, M4(-6.767e-02, 1.228e-01, 2.071e-02, 1.429e-02, -7.827e-02, -6.816e-02, 2.627e-02, -2.231e-02, -7.160e-02, -6.749e-03, 1.528e-02, 3.940e-02, 1.077e-02, 1.025e-02, -5.558e-03, -1.468e-02), r0);
	r1 = MulAdd(s1_0_2, M4(1.841e-01, -4.603e-02, -3.773e-02, 9.032e-02, 1.051e-01, -5.057e-02, -8.862e-02, -1.925e-01, 1.857e-01, -2.249e-02, -3.799e-01, 2.612e-02, -2.045e-01, 1.740e-02, -9.103e-03, -2.066e-01), r1);
	r2 = MulAdd(s1_0_2, M4(-2.846e-03, 6.666e-02, -1.284e-01, 5.556e-02, -7.713e-03, 6.924e-04, 3.504e-02, -6.498e-03, -1.640e-01, 3.185e-02, -1.319e-01, -6.169e-03, -3.910e-02, -2.686e-02, -2.949e-02, -1.605e-02), r2);
	r0 = MulAdd(s1_1_0, M4(7.939e-02, 3.356e-02, 5.530e-01, 4.903e-02, 8.573e-02, 5.080e-02, 1.974e-01, 8.937e-03, -2.743e-01, 1.401e-01, -7.677e-02, -1.633e-01, -5.816e-02, 7.878e-02, 1.803e-01, 1.909e-02), r0);
	r1 = MulAdd(s1_1_0, M4(5.803e-02, -3.197e-01, 8.781e-02, 8.940e-02, 9.154e-02, 2.225e-02, 1.476e-01, -2.870e-04, -9.272e-02, -2.495e-02, -1.920e-01, -3.183e-02, -6.969e-02, -1.345e-02, 3.260e-02, -7.009e-02), r1);
	r2 = MulAdd(s1_1_0, M4(4.459e-02, 5.366e-02, 4.856e-03, 5.695e-02, 1.046e-01, 1.130e-02, 2.837e-02, -9.776e-03, -2.744e-01, 4.807e-02, 7.303e-02, 8.264e-02, -5.158e-02, -1.575e-02, -3.077e-02, -1.079e-02), r2);
	r0 = MulAdd(s1_1_1, M4(-2.119e-01, -2.433e-01, -8.002e-01, -9.634e-02, -4.159e-01, 2.276e-01, 3.242e-01, 6.349e-02, 5.385e-01, 5.294e-01, -5.681e-01, -1.958e-01, 5.081e-02, 1.950e-03, -3.350e-01, -7.096e-02), r0);
	r1 = MulAdd(s1_1_1, M4(3.944e-01, 5.473e-01, 5.571e-01, 4.248e-01, -2.681e-01, -2.163e-01, 7.107e-02, -1.456e-01, 1.304e-01, -5.414e-02, -1.021e-01, -3.661e-01, -3.188e-02, -4.089e-02, 1.668e-01, -2.340e-01), r1);
	r2 = MulAdd(s1_1_1, M4(1.087e-01, 2.611e-01, -2.388e-01, 5.301e-01, 1.501e-01, 4.633e-02, -2.534e-01, 8.006e-02, 1.818e-01, 5.141e-01, 2.705e-01, 4.945e-02, -1.634e-01, 1.641e-01, 1.622e-01, 1.451e-01), r2);
	r0 = MulAdd(s1_1_2, M4(-6.611e-02, -2.156e-01, -7.073e-03, -1.890e-01, 7.509e-02, 3.447e-01, 1.394e-01, 7.120e-02, -1.499e-01, 3.424e-02, 2.644e-02, -1.152e-01, 7.010e-04, -2.442e-02, -2.530e-02, 4.683e-02), r0);
	r1 = MulAdd(s1_1_2, M4(-1.967e-01, -1.901e-01, 9.396e-02, -1.203e-01, -7.922e-03, 6.985e-02, 1.736e-01, -3.898e-01, 1.512e-03, -4.870e-02, 6.774e-02, 7.830e-02, -2.522e-01, 4.870e-02, -9.542e-02, -2.567e-01), r1);
	r2 = MulAdd(s1_1_2, M4(3.454e-03, 7.145e-02, 5.008e-02, -1.265e-01, 5.799e-02, -2.274e-02, -1.206e-01, 6.433e-02, 2.328e-01, 1.078e-01, -1.514e-01, 8.676e-02, 3.888e-02, 1.065e-02, 2.948e-02, 2.620e-02), r2);
	r0 = MulAdd(s1_2_0, M4(-9.943e-02, 1.274e-01, 2.744e-02, -6.320e-02, 2.240e-02, 3.819e-02, -5.030e-02, 7.096e-03, 3.115e-02, -9.540e-02, 2.267e-02, -2.325e-02, -2.451e-02, -8.073e-02, -1.197e-02, 2.537e-03), r0);
	r1 = MulAdd(s1_2_0, M4(-5.043e-02, -1.310e-01, -1.248e-01, 5.699e-02, 5.065e-02, 5.507e-02, 2.126e-03, 7.604e-02, -1.109e-02, 4.748e-03, 4.480e-02, -7.888e-03, 1.950e-02, -8.252e-02, 1.175e-03, 9.189e-03), r1);
	r2 = MulAdd(s1_2_0, M4(-1.009e-01, 3.767e-02, -6.039e-02, 1.837e-02, -6.330e-02, 2.647e-02, 4.195e-03, -9.567e-03, 6.778e-02, 6.756e-03, -4.456e-02, -5.696e-03, -6.599e-02, 2.700e-02, 3.778e-02, 1.596e-02), r2);
	r0 = MulAdd(s1_2_1, M4(7.110e-02, 1.958e-01, -9.620e-03, -1.475e-01, -2.569e-01, -2.023e-01, 5.212e-02, -6.246e-02, 7.396e-02, -2.228e-01, -1.093e-01, 5.474e-03, -6.186e-02, 3.593e-02, 1.263e-02, 9.349e-02), r0);
	r1 = MulAdd(s1_2_1, M4(3.542e-03, 9.631e-02, -2.951e-02, -1.706e-01, -1.705e-01, -5.293e-01, -2.059e-01, 2.797e-02, 1.571e-01, -3.753e-02, 2.195e-01, 9.552e-02, -2.251e-01, 5.198e-02, -2.562e-01, 6.117e-02), r1);
	r2 = MulAdd(s1_2_1, M4(-1.765e-01, 8.966e-02, 9.960e-02, 1.291e-01, 1.322e-01, -1.170e-01, 3.148e-02, -5.781e-02, 1.486e-01, 1.540e-02, -8.425e-02, -3.068e-02, -1.620e-01, -1.487e-02, -2.292e-02, -1.627e-02), r2);
	r0 = MulAdd(s1_2_2, M4(4.150e-02, 6.568e-02, -5.718e-02, 2.960e-02, -1.978e-01, -3.677e-01, 2.086e-03, -3.175e-02, -8.061e-02, -1.340e-01, -6.160e-02, -2.485e-03, 4.553e-02, 2.958e-02, 1.223e-01, 3.920e-02), r0);
	r1 = MulAdd(s1_2_2, M4(-3.258e-02, -5.477e-03, -3.057e-02, -4.284e-02, -1.197e-02, 9.217e-03, -3.683e-01, 5.082e-02, 2.611e-02, 1.189e-01, 1.939e-01, 2.538e-02, -4.202e-02, 4.599e-02, 1.417e-01, 6.977e-02), r1);
	r2 = MulAdd(s1_2_2, M4(2.921e-02, -3.269e-02, -4.897e-02, 1.156e-01, -1.598e-01, -9.644e-02, -2.202e-02, -3.648e-02, 3.609e-02, 5.847e-02, -4.901e-02, -3.992e-02, 4.314e-02, -4.972e-03, -2.500e-02, 1.170e-02), r2);
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	r0 = MulAdd(s0_0_0, M4(8.959e-02, -2.673e-03, -1.913e-01, -1.969e-02, -1.924e-02, 7.887e-02, 1.479e-01, 3.983e-02, 5.962e-02, 4.671e-03, -5.614e-02, 1.257e-02, -3.285e-02, 1.932e-02, -4.854e-01, -5.070e-02), r0);
	r1 = MulAdd(s0_0_0, M4(1.833e-01, -1.397e-02, -4.846e-02, 6.759e-02, 7.446e-02, 2.880e-02, -6.631e-02, 9.678e-02, 3.559e-02, 5.600e-02, -1.083e-01, -2.261e-01, -1.901e-04, 8.845e-02, -4.150e-02, -6.574e-02), r1);
	r2 = MulAdd(s0_0_0, M4(-2.865e-02, -1.417e-03, 1.587e-01, 2.135e-02, 6.211e-02, 2.407e-03, 5.445e-03, 1.129e-03, 1.947e-01, 1.685e-02, -1.107e-01, -1.760e-02, 1.283e-01, 3.570e-02, 2.804e-02, 9.617e-03), r2);
	r0 = MulAdd(s0_0_1, M4(-7.865e-02, 1.634e-01, 1.912e-01, -7.252e-02, 6.024e-03, -4.853e-02, 1.958e-02, 7.229e-02, -1.205e-01, 2.693e-02, 1.399e-02, -1.265e-01, 1.274e-01, -1.318e-02, -4.794e-01, -3.490e-01), r0);
	r1 = MulAdd(s0_0_1, M4(-1.910e-01, 1.039e-01, -1.583e-01, -5.036e-01, 1.124e-02, -1.249e-02, 1.995e-01, -1.294e-01, -9.577e-02, -3.329e-02, 2.657e-01, 1.559e-01, -4.041e-01, 6.030e-03, 3.768e-01, -5.834e-01), r1);
	r2 = MulAdd(s0_0_1, M4(1.017e-01, -8.637e-03, -2.858e-01, -7.349e-02, 6.686e-02, 6.969e-03, 1.692e-01, -2.241e-02, -3.378e-02, 3.035e-02, 1.345e-02, -2.253e-02, 2.920e-01, -8.633e-02, 1.511e-01, -7.850e-02), r2);
	r0 = MulAdd(s0_0_2, M4(8.925e-02, 2.178e-01, -4.665e-03, 2.277e-02, 3.629e-03, 8.271e-02, -5.387e-03, 2.279e-02, -1.161e-02, -4.201e-02, 5.092e-02, 2.161e-02, 3.574e-02, -1.967e-01, -1.515e-02, -1.112e-01), r0);
	r1 = MulAdd(s0_0_2, M4(9.262e-02, -4.582e-02, 1.278e-01, 1.155e-01, 4.126e-02, 4.902e-02, 8.342e-02, -2.260e-01, 8.453e-03, -7.318e-02, -8.308e-02, 2.059e-01, -1.412e-01, -2.276e-02, -2.710e-01, -1.484e-01), r1);
	r2 = MulAdd(s0_0_2, M4(1.228e-02, -3.395e-02, 2.571e-02, 1.297e-02, 3.723e-02, -2.071e-03, -7.338e-02, 1.827e-02, -8.719e-02, 6.320e-04, -1.144e-02, 3.006e-02, 8.714e-03, -1.242e-02, 1.132e-01, -1.156e-01), r2);
	r0 = MulAdd(s0_1_0, M4(-7.610e-02, 6.763e-02, -2.236e-02, -5.609e-02, 1.824e-01, -1.926e-01, 2.148e-01, -1.868e-02, 4.454e-02, -4.601e-01, -1.264e-01, -1.999e-02, 1.431e-01, -3.745e-02, 8.965e-02, 2.173e-01), r0);
	r1 = MulAdd(s0_1_0, M4(-1.988e-01, -1.109e-01, -1.877e-01, -6.963e-02, 7.082e-02, 1.944e-01, 1.514e-02, 3.279e-01, -3.203e-01, -4.973e-01, -4.664e-01, 7.868e-02, 1.285e-02, -4.477e-02, 1.873e-01, -1.166e-02), r1);
	r2 = MulAdd(s0_1_0, M4(3.886e-02, 4.242e-02, -1.443e-01, -9.022e-03, 4.682e-01, -6.875e-03, 1.126e-01, -2.767e-02, 1.832e-01, 4.116e-02, -3.932e-02, -1.045e-01, -8.465e-03, 4.567e-02, 1.761e-01, -2.580e-02), r2);
	r0 = MulAdd(s0_1_1, M4(1.238e-01, -2.686e-01, -5.612e-02, -2.666e-03, 2.339e-01, 4.346e-01, 1.314e-01, 1.924e-02, -1.509e-01, -4.989e-01, 1.501e-01, 1.200e-01, -2.115e-01, 9.423e-02, 9.274e-02, 7.989e-01), r0);
	r1 = MulAdd(s0_1_1, M4(-7.492e-02, -9.321e-02, -6.738e-01, -5.130e-02, -2.501e-02, -5.977e-02, 2.909e-01, 3.857e-01, 5.058e-01, 1.431e-01, 3.066e-02, -5.759e-02, -2.118e-01, 1.663e-01, -5.723e-01, -1.153e-01), r1);
	r2 = MulAdd(s0_1_1, M4(-3.235e-01, -2.121e-01, 2.569e-01, -1.528e-01, -2.677e-01, -6.370e-02, -8.657e-02, -8.176e-02, -2.186e-01, -1.078e-01, -6.007e-02, 2.065e-01, 2.106e-01, -8.228e-02, -5.626e-02, 3.467e-01), r2);
	r0 = MulAdd(s0_1_2, M4(8.197e-03, -1.294e-01, 1.733e-02, 1.931e-02, -4.728e-02, -9.164e-03, -2.376e-02, 1.941e-02, -2.734e-04, -8.556e-02, 4.309e-02, -1.098e-01, 7.436e-02, 2.992e-01, -1.200e-01, 1.360e-01), r0);
	r1 = MulAdd(s0_1_2, M4(-1.536e-02, -3.847e-02, -7.493e-02, -1.711e-01, -2.138e-02, 1.281e-02, -1.903e-02, 2.574e-01, -1.406e-01, 3.598e-02, 4.663e-02, -2.151e-01, 2.252e-01, -4.811e-02, 3.256e-01, 1.356e-01), r1);
	r2 = MulAdd(s0_1_2, M4(-2.039e-02, -1.513e-02, 5.000e-02, -3.159e-02, 6.020e-02, 5.200e-03, -6.370e-02, 6.347e-03, -1.698e-01, -1.361e-02, 7.443e-02, -3.406e-02, 1.949e-01, -1.162e-03, 2.776e-02, 1.013e-01), r2);
	r0 = MulAdd(s0_2_0, M4(1.312e-02, -1.763e-01, 8.862e-02, 2.623e-02, 1.306e-01, 2.339e-01, 1.047e-01, -8.203e-02, 6.696e-02, 2.921e-01, 2.853e-02, -1.762e-01, 7.174e-03, -1.132e-01, -4.945e-03, 3.782e-02), r0);
	r1 = MulAdd(s0_2_0, M4(-1.510e-01, -1.849e-01, -2.498e-01, 3.149e-02, 6.091e-02, 1.929e-01, 2.643e-01, -1.762e-02, -1.128e-01, -1.899e-02, 3.586e-02, -5.449e-02, 2.180e-02, 1.740e-02, 1.439e-01, -7.136e-02), r1);
	r2 = MulAdd(s0_2_0, M4(-1.213e-02, -2.969e-02, 1.346e-02, -5.552e-02, 1.542e-02, 1.388e-01, -3.673e-02, -3.479e-02, -4.436e-02, -1.464e-02, 6.454e-02, -9.510e-02, 4.267e-02, 4.176e-02, 8.392e-03, -1.376e-02), r2);
	r0 = MulAdd(s0_2_1, M4(6.466e-02, 2.391e-02, -3.201e-02, -4.538e-02, 2.452e-02, -4.810e-01, -2.658e-02, -1.891e-01, -1.512e-01, 5.722e-01, 1.338e-01, -1.950e-01, -4.342e-04, -2.209e-01, -9.457e-02, -1.311e-01), r0);
	r1 = MulAdd(s0_2_1, M4(5.958e-02, 1.147e-02, -1.364e-01, 1.638e-02, 1.714e-01, 5.620e-02, -3.150e-02, 2.378e-02, -1.199e-01, -2.284e-01, -2.998e-01, 8.139e-02, 8.701e-02, 4.114e-02, -2.757e-01, 1.158e-01), r1);
	r2 = MulAdd(s0_2_1, M4(7.009e-03, -7.951e-02, 2.208e-02, -3.064e-02, -9.151e-03, -7.640e-02, -4.321e-03, -1.093e-01, -1.043e-01, 3.132e-01, -5.674e-02, -9.430e-02, 1.231e-02, -5.795e-02, -5.944e-03, -2.968e-04), r2);
	r0 = MulAdd(s0_2_2, M4(-2.373e-02, -8.691e-03, 1.635e-02, -3.386e-03, -3.621e-02, -1.071e-01, 3.716e-02, -8.890e-02, 3.590e-02, 1.002e-01, 4.824e-02, -5.250e-05, 8.642e-03, -1.223e-01, -1.065e-02, 5.611e-02), r0);
	r1 = MulAdd(s0_2_2, M4(-3.300e-02, 2.259e-02, -7.543e-02, 5.432e-02, -2.628e-02, -5.818e-02, -1.636e-01, -5.271e-02, 1.063e-01, 1.479e-02, -8.713e-02, 2.314e-02, -8.093e-02, 2.952e-02, 2.613e-01, -9.142e-02), r1);
	r2 = MulAdd(s0_2_2, M4(4.313e-02, 8.823e-03, -3.201e-02, -1.316e-02, -2.353e-02, 7.162e-03, -3.505e-02, -1.920e-02, -8.286e-02, 3.308e-02, 7.691e-02, -5.758e-02, 9.153e-03, -8.647e-03, 1.455e-03, 4.797e-02), r2);
	r0 = max(r0, 0.0);
	T3[gxy] = r0;
	r1 = max(r1, 0.0);
	T4[gxy] = r1;
	r2 = max(r2, 0.0);
	T5[gxy] = r2;
}

//!PASS 5
//!DESC conv4 (12x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T3, T4, T5
//!OUT T0, T1, T2

// Pass 5 tap helpers: fetch one texel of T3 / T4 / T5 through the O() macro
// from the COMMON section (point sampler SP, offset of (x, y) texels) and
// convert it to V4. They expand to code that uses `pos` and `pt`, so both
// must be in scope at the expansion site.
#define L0(x, y) V4(O(T3, x, y))
#define L1(x, y) V4(O(T4, x, y))
#define L2(x, y) V4(O(T5, x, y))

// Pass 5 ("conv4"): 3x3 convolution from the 12 channels held in T3/T4/T5
// (3 textures x 4 components) to the 12 channels written to T0/T1/T2.
//
// For every output pixel the nine neighbouring texels of each input texture
// are fetched, each tap is combined with three constant 4x4 weight matrices
// through MulAdd (one matrix per output accumulator r0/r1/r2), negatives are
// clamped to zero and the accumulators are stored.
//
// blockStart: pixel offset of the block handled by this thread group.
// tid:        thread id; only tid.x is used, remapped through Rmp8x8
//             (presumably to a 2D position inside an 8x8 tile -- helper is
//             defined outside this file, TODO confirm).
//
// NOTE(review): MulAdd comes from the `//!USE MulAdd` directive; it is
// presumably mul(tap, matrix) + accumulator -- confirm against Magpie.
// The accumulation order below is fixed; reordering taps may change the
// low-order bits of the result.
void Pass5(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Threads whose pixel lies outside the input size do no work.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// Sampling position used by the L0/L1/L2 macros (texel centre, assuming
	// pt is the size of one input texel in UV space -- TODO confirm).
	float2 pos = (gxy + 0.5) * pt;
	// Tap registers are named s<set>_<row>_<col>: row index = y offset + 1,
	// column index = x offset + 1. Set 0 first holds T3 taps and is later
	// reloaded with T5 taps; set 1 holds T4 taps.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	// Fetch the 3x3 neighbourhood of T3 (L0) and T4 (L1).
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// Accumulate the T3 taps into r0/r1/r2.
	r0 = MulAdd(s0_0_0, M4(-2.727e-02, 1.309e-04, -2.908e-02, -1.519e-01, 1.611e-02, 2.656e-02, 3.036e-02, 3.973e-02, -2.100e-02, 7.087e-02, 9.405e-03, -2.025e-01, 4.330e-02, 3.116e-02, 5.007e-02, -1.908e-02), r0);
	r1 = MulAdd(s0_0_0, M4(1.694e-02, -2.915e-02, 1.031e-01, -3.175e-02, 6.607e-03, 3.992e-02, -3.433e-02, 5.501e-02, -1.906e-02, -1.098e-02, 9.639e-02, -5.927e-02, 2.369e-02, 5.583e-02, -2.333e-02, 9.832e-02), r1);
	r2 = MulAdd(s0_0_0, M4(-7.803e-02, -3.739e-02, -1.933e-02, 1.024e-01, -1.521e-01, 3.436e-02, 4.797e-02, 3.213e-01, -3.358e-03, -1.906e-02, -2.163e-02, -9.750e-03, -1.862e-01, 6.813e-02, -4.663e-02, 4.062e-01), r2);
	r0 = MulAdd(s0_0_1, M4(-1.375e-02, -2.107e-02, -3.257e-02, 1.547e-05, 2.437e-01, 2.932e-02, 5.787e-02, 3.862e-02, -4.313e-02, 2.122e-02, -6.594e-02, -5.287e-01, 2.565e-01, -3.315e-01, 5.585e-01, 6.223e-02), r0);
	r1 = MulAdd(s0_0_1, M4(-4.478e-02, -2.009e-02, -1.818e-01, 4.229e-01, 1.890e-01, 3.088e-01, -2.115e-01, 2.632e-01, 6.740e-03, -1.149e-01, -2.338e-01, -8.131e-02, 1.179e-01, 5.189e-01, -6.871e-02, 6.572e-02), r1);
	r2 = MulAdd(s0_0_1, M4(9.544e-02, -2.310e-02, 1.319e-02, -2.155e-03, 1.657e-01, 1.275e-01, 9.626e-02, 1.320e-01, -5.581e-02, -1.464e-02, -1.773e-02, -8.000e-02, -3.091e-01, 2.104e-01, 2.885e-01, 2.329e-01), r2);
	r0 = MulAdd(s0_0_2, M4(-5.065e-02, -5.446e-02, -4.944e-04, -4.034e-02, 2.173e-01, -1.188e+00, 3.492e-02, 8.569e-02, -2.253e-01, -1.364e-01, -2.281e-02, 4.662e-02, 7.683e-02, 3.363e-01, 2.751e-02, 5.878e-02), r0);
	r1 = MulAdd(s0_0_2, M4(-3.541e-02, 4.402e-02, 1.278e-01, -5.958e-05, 2.019e-01, -2.443e-02, 7.934e-02, -2.750e-02, -1.170e-01, 4.724e-02, -3.210e-01, -2.888e-02, 9.608e-02, -3.064e-03, -1.704e-01, 5.073e-02), r1);
	r2 = MulAdd(s0_0_2, M4(-5.317e-02, 1.861e-02, -3.147e-02, 1.764e-02, 6.342e-02, -1.906e-02, 1.107e-01, 6.477e-03, 3.684e-02, -1.680e-02, -3.109e-02, 6.510e-03, 2.112e-02, 1.938e-02, -4.183e-02, -2.323e-03), r2);
	r0 = MulAdd(s0_1_0, M4(1.369e-02, -1.626e-01, -2.324e-03, -4.431e-02, -3.652e-02, -2.324e-01, 9.196e-03, -4.186e-02, -3.299e-03, 7.190e-03, 2.213e-02, -9.451e-02, -3.175e-02, 3.957e-01, 7.511e-02, 1.300e-01), r0);
	r1 = MulAdd(s0_1_0, M4(4.700e-02, 4.942e-04, 1.577e-01, -7.593e-02, 1.132e-02, 1.635e-02, 5.359e-02, -1.383e-02, 1.741e-02, -4.700e-02, 2.583e-02, -2.823e-02, -2.998e-02, -5.311e-02, 9.678e-02, -6.823e-03), r1);
	r2 = MulAdd(s0_1_0, M4(6.940e-02, 3.588e-02, -5.945e-02, -2.764e-01, -1.241e-01, -1.561e-02, -3.800e-02, -1.604e-01, 6.185e-03, -4.971e-02, -4.575e-02, -1.440e-01, -4.246e-01, -1.055e-01, -5.973e-02, 1.199e-01), r2);
	r0 = MulAdd(s0_1_1, M4(2.337e-03, -1.924e-01, -3.161e-01, -9.626e-02, -1.748e-01, -2.902e-01, 5.678e-02, -2.327e-02, 1.482e-02, -1.355e+00, -1.255e-01, -1.991e-01, -2.053e-01, -2.881e-01, -2.781e-01, 1.659e-01), r0);
	r1 = MulAdd(s0_1_1, M4(-3.018e-01, 5.700e-01, -4.147e-02, 2.329e-01, -1.323e-01, -3.702e-01, -5.547e-02, -1.158e-01, -1.180e-01, -3.622e-01, -1.965e-01, -2.838e-01, 7.697e-02, 2.177e-01, 2.398e-02, 7.737e-02), r1);
	r2 = MulAdd(s0_1_1, M4(2.881e-01, 2.156e-01, -6.043e-03, 2.901e-01, -2.456e-01, 2.506e-02, 2.579e-02, -1.253e-01, -6.158e-01, -7.861e-02, 1.898e-02, -6.450e-01, 2.607e-01, -1.314e-01, -1.252e-01, 1.173e-01), r2);
	r0 = MulAdd(s0_1_2, M4(1.588e-01, 1.607e+00, -2.734e-02, -4.375e-02, -2.085e-01, 1.261e+00, 4.343e-02, -3.755e-02, -3.069e-01, -2.184e+00, -2.134e-01, -5.473e-02, -9.711e-02, -2.158e-01, 2.559e-02, -4.767e-02), r0);
	r1 = MulAdd(s0_1_2, M4(2.075e-01, -1.688e-01, 1.132e-01, -4.721e-02, -1.274e-01, 4.799e-02, 3.502e-02, 1.615e-03, -3.057e-01, -2.144e-01, -2.676e-01, -8.032e-02, -5.124e-02, 3.688e-02, -2.506e-02, -2.667e-02), r1);
	r2 = MulAdd(s0_1_2, M4(-9.215e-02, 1.764e-02, 1.775e-01, -4.549e-03, 5.221e-02, 1.022e-02, -5.686e-02, 2.772e-03, 5.294e-02, -1.681e-02, -3.037e-01, 2.625e-02, 3.732e-02, 6.646e-03, -9.539e-02, 1.593e-02), r2);
	r0 = MulAdd(s0_2_0, M4(9.284e-03, 4.993e-02, -1.048e-02, 2.463e-03, -5.395e-03, 1.577e-01, -3.834e-02, 3.186e-02, -5.844e-03, 6.341e-02, 6.016e-03, 1.367e-02, -1.788e-02, 8.847e-02, 2.978e-02, 1.645e-03), r0);
	r1 = MulAdd(s0_2_0, M4(1.667e-02, -3.454e-02, -2.804e-02, 5.984e-03, 7.738e-03, -2.588e-03, -2.645e-03, 7.407e-03, -9.127e-04, 2.816e-04, 8.593e-03, 1.721e-03, 7.138e-03, 3.430e-02, 2.528e-02, 5.629e-03), r1);
	r2 = MulAdd(s0_2_0, M4(-3.774e-02, -2.831e-02, -2.061e-02, 2.291e-02, 1.816e-02, -1.343e-02, 3.851e-03, 5.958e-03, -1.035e-01, -9.970e-03, -4.731e-03, 3.237e-02, -3.690e-02, 2.673e-03, -1.758e-02, -3.816e-03), r2);
	r0 = MulAdd(s0_2_1, M4(4.505e-02, 2.611e-01, 3.137e-02, 4.067e-02, -4.618e-02, -4.364e-01, -3.042e-02, 4.705e-03, -3.033e-02, -3.956e+00, -5.441e-02, -4.442e-02, 6.195e-02, -1.666e-01, 3.143e-02, 1.678e-02), r0);
	r1 = MulAdd(s0_2_1, M4(1.911e-02, -1.353e-01, -2.558e-02, -3.356e-02, 1.788e-02, 4.101e-02, 6.096e-03, 2.772e-02, 1.872e-02, -4.771e-02, -2.336e-02, -3.046e-03, -2.498e-02, -3.830e-02, -4.695e-02, -2.900e-02), r1);
	r2 = MulAdd(s0_2_1, M4(9.354e-02, -7.933e-02, 1.372e-02, -7.399e-02, 3.586e-02, -3.328e-02, -5.581e-02, 6.134e-02, -2.763e-01, -7.891e-02, 3.334e-03, 1.541e-02, -4.461e-02, 2.501e-02, 2.335e-02, -3.880e-02), r2);
	r0 = MulAdd(s0_2_2, M4(-6.245e-02, -1.883e+00, 3.229e-02, -3.081e-03, -2.046e-02, -5.972e-01, -4.816e-03, 2.912e-02, 8.596e-02, -6.499e+00, -1.353e-01, -2.070e-03, -1.564e-02, -1.813e+00, -1.290e-02, 2.735e-04), r0);
	r1 = MulAdd(s0_2_2, M4(-1.923e-02, -2.428e-02, 1.042e-02, -3.710e-03, 3.888e-02, 2.809e-02, 3.450e-02, 9.885e-03, 6.890e-03, 2.714e-02, 3.798e-02, 1.434e-02, -2.555e-02, -3.430e-02, 6.958e-03, -4.325e-03), r1);
	r2 = MulAdd(s0_2_2, M4(-3.717e-02, 2.387e-02, 6.532e-02, -3.033e-02, 1.984e-02, -1.703e-02, 1.334e-02, 1.720e-02, 5.117e-02, 2.950e-02, 6.385e-02, -1.740e-02, 7.149e-02, 3.934e-03, -1.840e-02, 2.502e-04), r2);
	// Accumulate the T4 taps.
	r0 = MulAdd(s1_0_0, M4(2.592e-02, 5.290e-02, -4.970e-03, -2.054e-01, -3.755e-02, 6.851e-02, 2.846e-02, -6.425e-02, 2.337e-02, -1.741e-03, 4.046e-03, 3.506e-01, 2.250e-02, 1.069e-02, -2.982e-03, 2.453e-01), r0);
	r1 = MulAdd(s1_0_0, M4(2.055e-02, 7.572e-03, 9.458e-02, -7.228e-03, -1.751e-02, 2.368e-02, -1.224e-02, 7.985e-02, 4.653e-02, 2.804e-02, 1.246e-01, 6.385e-03, -4.385e-03, 1.936e-02, 2.964e-02, 2.826e-02), r1);
	r2 = MulAdd(s1_0_0, M4(1.951e-02, -1.862e-02, 4.773e-02, 5.444e-02, 2.359e-01, 1.292e-02, 1.690e-02, 1.086e-01, 1.928e-01, 3.997e-02, -7.160e-03, 6.125e-02, 5.878e-02, -1.612e-02, 1.681e-02, -6.123e-03), r2);
	r0 = MulAdd(s1_0_1, M4(-1.151e-01, 5.422e-02, 1.861e-02, -5.928e-02, 7.049e-03, 4.892e-02, 2.489e-02, 2.806e-02, 8.529e-02, -3.048e-02, -5.713e-02, 2.156e-01, 6.728e-02, -7.261e-03, -1.885e-02, 2.790e-01), r0);
	r1 = MulAdd(s1_0_1, M4(-1.532e-01, -5.308e-02, -9.154e-02, -3.509e-02, -3.038e-02, 1.440e-01, 3.129e-01, 3.508e-01, 1.261e-01, -1.812e-02, -1.900e-02, 1.479e-01, 8.767e-02, 7.027e-03, -5.286e-02, 2.816e-03), r1);
	r2 = MulAdd(s1_0_1, M4(7.426e-03, 5.510e-02, -3.696e-02, -6.785e-02, 9.618e-03, 8.992e-02, 5.240e-02, -1.802e-01, -7.977e-02, -3.152e-02, 7.447e-02, -2.082e-02, -2.690e-02, 2.268e-02, 3.233e-02, 1.737e-02), r2);
	r0 = MulAdd(s1_0_2, M4(4.555e-02, 1.479e-02, 2.967e-02, -1.442e-02, 7.180e-03, -3.565e-01, -2.763e-02, -1.111e-02, -9.671e-03, -1.544e-01, -9.955e-04, 1.196e-01, 2.007e-02, 1.856e-01, 6.616e-03, 9.224e-02), r0);
	r1 = MulAdd(s1_0_2, M4(6.568e-02, -1.824e-02, 1.297e-01, -6.522e-02, 1.606e-01, -6.909e-02, 3.798e-02, -1.491e-01, -2.276e-02, 8.422e-03, 9.013e-02, -2.776e-03, 6.228e-02, 1.263e-02, 9.653e-02, 3.601e-02), r1);
	r2 = MulAdd(s1_0_2, M4(7.072e-02, -1.715e-02, 4.973e-02, -1.584e-02, 5.872e-02, 2.523e-03, 1.673e-02, 1.146e-02, 3.654e-02, 5.951e-03, -1.643e-02, 1.579e-02, 1.871e-02, -1.168e-02, 1.643e-02, 1.609e-02), r2);
	r0 = MulAdd(s1_1_0, M4(5.519e-03, 1.645e-01, 1.150e-02, 2.544e-01, -7.617e-03, 2.704e-01, -9.432e-03, -5.943e-02, -4.611e-02, -5.992e-02, 2.906e-02, -1.153e-01, 2.460e-02, 2.195e-01, -2.071e-02, 1.810e-01), r0);
	r1 = MulAdd(s1_1_0, M4(1.409e-02, -1.733e-01, 7.971e-02, -7.440e-02, -1.636e-02, 8.569e-02, -3.073e-02, 6.265e-02, 4.601e-02, -1.929e-01, 5.106e-02, -3.232e-02, 5.240e-02, -3.524e-02, 1.381e-01, -1.161e-01), r1);
	r2 = MulAdd(s1_1_0, M4(4.033e-01, 3.937e-02, -5.188e-02, 1.268e-02, 5.325e-02, -4.316e-02, 4.137e-02, -5.142e-02, 3.916e-01, -2.902e-02, -9.443e-02, -2.192e-01, 2.729e-02, -1.087e-01, -5.653e-02, -2.381e-01), r2);
	r0 = MulAdd(s1_1_1, M4(1.480e-01, -8.862e-02, -9.211e-02, 6.054e-02, -5.846e-02, -4.487e-02, 2.418e-02, -5.894e-02, 5.740e-02, 1.811e-01, 2.893e-01, 4.721e-02, -1.842e-01, 6.966e-02, 2.224e-01, 1.560e-01), r0);
	r1 = MulAdd(s1_1_1, M4(-7.771e-02, 2.714e-01, 7.491e-02, 1.421e-01, 7.651e-02, -7.151e-02, 1.958e-01, -1.226e-01, -2.065e-01, 9.061e-02, -1.484e-01, 5.903e-05, -1.793e-01, -5.562e-02, -8.801e-02, 1.777e-01), r1);
	r2 = MulAdd(s1_1_1, M4(-2.798e-01, 4.018e-02, 1.512e-01, 7.837e-02, -4.840e-01, 4.873e-01, -1.994e-01, 7.817e-02, -2.065e-01, 1.152e-01, 4.626e-01, 1.076e-01, -2.290e-03, 9.856e-02, -1.259e-01, 4.734e-02), r2);
	r0 = MulAdd(s1_1_2, M4(-1.851e-01, 3.546e-01, -2.360e-02, -3.157e-02, 4.399e-02, -4.510e-02, -7.097e-02, 8.825e-02, 3.637e-02, 9.907e-01, 5.249e-03, 2.344e-02, 1.238e-01, -1.703e-01, 5.481e-02, 4.978e-02), r0);
	r1 = MulAdd(s1_1_2, M4(5.577e-03, -4.816e-02, 2.052e-02, -4.578e-02, 1.899e-01, 5.360e-02, 3.776e-02, 8.226e-02, 1.791e-02, -1.368e-02, 1.725e-01, -1.167e-02, 1.399e-01, 8.155e-03, 7.052e-02, -2.339e-02), r1);
	r2 = MulAdd(s1_1_2, M4(3.190e-02, -5.023e-03, -1.304e-01, 2.027e-03, 3.478e-01, -1.047e-01, -1.873e-01, -2.954e-02, 6.768e-02, 3.294e-02, -5.822e-02, 1.713e-03, 1.237e-01, 1.797e-02, 5.842e-02, 5.174e-03), r2);
	r0 = MulAdd(s1_2_0, M4(-2.003e-03, -1.878e-01, -3.350e-02, -7.598e-03, 1.776e-03, -8.857e-02, -4.625e-03, 3.503e-02, -1.505e-02, -2.292e-01, 5.102e-03, 4.941e-02, 8.894e-02, 1.588e-01, 4.509e-02, 8.071e-02), r0);
	r1 = MulAdd(s1_2_0, M4(-7.560e-03, 6.199e-02, -1.393e-02, 2.012e-02, -4.952e-03, 6.089e-03, 2.127e-02, -8.758e-04, 1.181e-02, 2.480e-02, 3.556e-02, -6.715e-03, 7.153e-02, 1.448e-01, 3.162e-03, 1.081e-01), r1);
	r2 = MulAdd(s1_2_0, M4(4.267e-01, 8.376e-02, -4.354e-02, -4.583e-02, 1.665e-01, 1.750e-02, 3.358e-02, 2.881e-02, 1.068e-01, 4.153e-03, -2.303e-02, 7.735e-02, -8.411e-02, 1.859e-02, -2.775e-02, 3.721e-01), r2);
	r0 = MulAdd(s1_2_1, M4(-2.631e-02, 4.419e-02, 8.785e-02, 4.309e-02, 1.655e-02, -2.870e-01, -1.023e-01, 4.507e-02, -1.242e-02, 3.545e-01, -3.836e-02, 2.087e-02, 1.676e-01, 2.823e-02, 3.934e-02, 5.054e-02), r0);
	r1 = MulAdd(s1_2_1, M4(-3.722e-02, -2.585e-01, 2.661e-02, -1.013e-01, 1.498e-02, 8.765e-02, -5.460e-02, 1.846e-02, 8.534e-03, 3.968e-02, -5.335e-02, 3.123e-02, 2.433e-01, 2.532e-01, -7.664e-02, 4.476e-02), r1);
	r2 = MulAdd(s1_2_1, M4(-3.999e-02, -8.131e-02, -3.012e-02, -4.403e-02, 1.557e-01, 1.130e-02, -8.872e-02, 4.311e-03, -1.587e-01, -1.466e-02, -2.075e-02, -4.688e-03, -2.590e-01, -3.206e-02, 4.179e-01, 7.117e-03), r2);
	r0 = MulAdd(s1_2_2, M4(-5.890e-03, -2.237e+00, -4.558e-02, 4.035e-02, -4.194e-02, -3.079e+00, 4.219e-03, 4.099e-02, -2.550e-02, -9.273e-01, 1.721e-02, 2.350e-02, -2.708e-01, -8.572e-01, 4.485e-02, 2.306e-02), r0);
	r1 = MulAdd(s1_2_2, M4(-8.757e-02, -3.185e-02, 4.265e-02, -1.376e-02, 4.782e-02, -1.998e-03, 1.938e-03, 2.367e-03, 2.203e-02, 5.851e-03, 5.382e-02, -1.014e-04, -1.137e-02, 2.299e-02, -2.705e-02, 6.238e-03), r1);
	r2 = MulAdd(s1_2_2, M4(2.018e-02, -4.529e-02, -8.522e-02, -2.110e-03, 3.978e-02, 4.122e-02, 8.515e-02, 2.388e-02, 5.390e-02, -2.722e-02, -1.532e-02, 1.224e-02, 1.019e-01, 1.801e-02, -8.281e-02, 2.148e-02), r2);
	// The T3 taps are no longer needed: reload the s0_* registers with the
	// 3x3 neighbourhood of T5 (L2) and accumulate its contribution.
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	r0 = MulAdd(s0_0_0, M4(-2.263e-03, 2.741e-02, 2.329e-02, 1.351e-01, 3.967e-02, -1.866e-01, -1.657e-02, 1.982e-01, -4.327e-03, -8.622e-03, -1.404e-03, -2.427e-01, -2.887e-02, 3.779e-02, 4.954e-02, 2.466e-01), r0);
	r1 = MulAdd(s0_0_0, M4(-9.733e-03, 6.460e-03, -3.133e-02, 6.615e-03, 2.061e-02, 6.564e-04, -1.516e-01, 1.003e-01, -1.632e-02, -6.257e-04, 2.345e-02, -2.137e-02, -4.705e-02, -9.021e-03, 4.338e-02, -6.600e-02), r1);
	r2 = MulAdd(s0_0_0, M4(2.295e-02, -3.476e-03, 3.499e-03, -1.631e-02, 1.319e-01, 4.040e-02, 6.247e-02, -1.218e-01, 1.957e-02, -4.411e-03, -8.572e-03, -1.163e-02, -4.662e-02, 2.717e-02, -4.264e-02, 1.004e-01), r2);
	r0 = MulAdd(s0_0_1, M4(2.860e-02, -4.100e-02, -8.437e-03, 2.041e-01, 3.837e-01, 3.348e-02, -1.304e-01, -1.701e-01, -3.436e-03, -1.470e-02, -6.076e-03, -2.764e-01, -2.390e-01, 9.583e-02, 1.471e-01, -2.697e-01), r0);
	r1 = MulAdd(s0_0_1, M4(-8.358e-03, 2.240e-02, 2.912e-01, -3.149e-02, 4.036e-01, -7.200e-02, 6.230e-01, 2.561e-01, -2.414e-02, 1.463e-02, -1.826e-01, -4.012e-02, -2.280e-01, 3.591e-02, 1.616e-01, -2.280e-01), r1);
	r2 = MulAdd(s0_0_1, M4(4.495e-02, -4.993e-03, 2.791e-02, 4.629e-02, -1.499e-01, -5.319e-02, 1.879e-01, -1.526e-01, -5.977e-02, 1.249e-02, -1.724e-02, 4.554e-02, 1.989e-01, 1.654e-02, -1.333e-01, 9.448e-02), r2);
	r0 = MulAdd(s0_0_2, M4(7.066e-02, -2.294e-01, 2.536e-02, 7.966e-02, -5.189e-02, -4.871e-01, -1.172e-02, -7.611e-02, 7.430e-03, 1.232e-01, 1.797e-02, 6.102e-02, -8.555e-02, -2.012e-01, 2.093e-02, -1.189e-01), r0);
	r1 = MulAdd(s0_0_2, M4(6.616e-02, 1.436e-02, -5.392e-02, 2.411e-02, -6.518e-02, -3.071e-02, -3.115e-01, -2.899e-02, -7.897e-03, 1.232e-02, 2.239e-02, -3.292e-03, -1.233e-01, 1.923e-02, -2.488e-01, 2.111e-02), r1);
	r2 = MulAdd(s0_0_2, M4(-3.384e-02, -7.344e-03, 3.716e-02, 1.085e-02, 3.150e-03, -8.980e-03, -3.600e-02, -2.657e-02, 4.675e-02, 1.428e-02, 1.490e-02, 2.248e-02, -5.460e-02, -9.132e-03, -1.018e-02, 1.022e-02), r2);
	r0 = MulAdd(s0_1_0, M4(4.099e-03, 8.081e-02, 4.480e-02, -1.091e-01, -8.658e-02, 3.918e-02, -1.480e-01, -2.024e-02, -2.075e-03, 1.265e-01, 2.178e-02, -9.243e-02, 2.124e-02, -1.919e-01, 3.432e-02, -2.415e-01), r0);
	r1 = MulAdd(s0_1_0, M4(-2.508e-02, 1.005e-02, 1.661e-02, 1.767e-02, -2.300e-02, 6.469e-02, -9.147e-02, 8.080e-02, -1.122e-02, 5.159e-03, 4.370e-02, -3.925e-02, -4.100e-02, 3.982e-05, 4.827e-02, -3.869e-02), r1);
	r2 = MulAdd(s0_1_0, M4(7.392e-02, 1.863e-02, 7.630e-03, 1.177e-01, 4.016e-01, 7.014e-02, -1.869e-03, 2.218e-01, -1.065e-01, 2.574e-03, -1.601e-02, -3.698e-02, -1.588e-02, -4.063e-02, -9.446e-02, -1.276e-01), r2);
	r0 = MulAdd(s0_1_1, M4(-6.616e-02, 2.173e-01, 3.427e-01, -2.007e-01, -3.127e-01, 2.807e-01, 9.354e-02, 3.853e-02, -2.302e-01, -1.659e-01, -8.423e-02, 2.401e-01, 2.407e-01, 2.665e-02, 1.685e-01, -5.188e-02), r0);
	r1 = MulAdd(s0_1_1, M4(1.411e-01, -4.164e-02, 8.621e-02, 3.191e-02, 3.041e-01, 3.877e-01, 1.353e-01, 1.323e-01, -1.537e-01, -8.718e-02, -8.590e-01, 6.315e-04, -1.470e-01, -1.930e-01, -3.335e-01, -8.103e-02), r1);
	r2 = MulAdd(s0_1_1, M4(8.862e-02, 1.172e-01, -5.668e-02, -2.861e-01, -1.569e-01, -1.772e-01, 3.521e-02, 1.749e-01, -4.234e-01, -7.495e-02, -1.284e-01, 9.243e-04, 6.868e-02, 4.415e-01, 3.861e-01, -1.030e-01), r2);
	r0 = MulAdd(s0_1_2, M4(-1.107e-01, 1.425e-01, -4.724e-02, -1.201e-01, 1.477e-02, 9.205e-01, -5.871e-02, 2.813e-02, -2.136e-02, -1.873e-01, 2.642e-03, -6.517e-04, 1.126e-01, 1.253e-01, -4.987e-02, 1.736e-02), r0);
	r1 = MulAdd(s0_1_2, M4(-1.465e-01, -7.470e-02, -1.292e-01, -7.691e-02, 1.479e-01, 2.410e-02, -3.100e-02, 1.533e-02, -5.778e-02, 2.365e-03, 1.016e-01, -5.000e-02, 2.144e-01, -4.730e-02, 2.723e-02, 2.220e-02), r1);
	r2 = MulAdd(s0_1_2, M4(4.807e-03, -4.431e-02, -4.825e-02, -1.865e-02, -6.382e-02, 1.001e-02, -1.170e-02, 1.815e-02, 1.019e-01, 1.340e-02, -6.091e-02, -3.948e-03, -1.961e-01, 2.424e-02, 3.617e-01, -4.162e-02), r2);
	r0 = MulAdd(s0_2_0, M4(4.317e-03, 1.920e-01, 2.208e-02, 1.454e-02, -2.021e-02, -4.211e-01, 2.895e-02, 1.412e-02, 3.150e-02, 4.403e-02, 4.675e-02, 3.833e-03, 3.211e-02, -2.832e-01, 3.071e-02, 1.675e-02), r0);
	r1 = MulAdd(s0_2_0, M4(-2.176e-02, -2.079e-03, -3.389e-03, 4.453e-04, -1.403e-02, 1.311e-02, 5.519e-03, -8.438e-03, 9.430e-04, -5.334e-02, 5.989e-02, -3.272e-02, -8.549e-03, -4.421e-02, 9.165e-05, -2.155e-02), r1);
	r2 = MulAdd(s0_2_0, M4(2.348e-02, -2.966e-02, 3.112e-03, 6.342e-03, 1.295e-02, 9.636e-03, -1.194e-02, -2.517e-02, -2.147e-01, -1.435e-02, 7.119e-03, 1.162e-02, -1.213e-01, -1.593e-02, -1.763e-02, -3.528e-02), r2);
	r0 = MulAdd(s0_2_1, M4(-9.186e-03, -5.112e-01, 5.398e-02, -1.363e-02, 2.124e-02, 4.742e-01, 2.735e-03, 1.757e-02, 1.548e-01, -7.919e-01, -1.661e-02, -1.009e-01, -8.376e-02, 4.669e-01, -1.596e-01, -4.622e-02), r0);
	r1 = MulAdd(s0_2_1, M4(-5.122e-03, 2.249e-02, 6.852e-02, -1.145e-02, -1.312e-02, -6.100e-02, -9.264e-03, -2.166e-02, -8.375e-02, -9.536e-03, -5.806e-02, -5.881e-02, -5.741e-03, -3.332e-02, -4.204e-03, 3.697e-03), r1);
	r2 = MulAdd(s0_2_1, M4(-1.627e-01, -7.544e-02, -5.823e-02, 6.148e-02, -1.517e-01, 1.079e-02, 3.550e-02, -4.430e-02, -3.532e-01, 3.638e-02, -2.840e-02, -9.497e-02, 1.811e-01, 2.040e-02, -8.878e-03, -9.786e-03), r2);
	r0 = MulAdd(s0_2_2, M4(2.905e-02, -3.342e+00, -3.869e-02, 1.365e-02, 1.235e-02, -7.575e+00, -2.342e-03, -1.785e-02, -6.815e-04, -8.658e+00, 5.212e-02, -1.059e-02, -5.092e-02, -7.297e+00, 2.950e-02, -3.657e-02), r0);
	r1 = MulAdd(s0_2_2, M4(2.125e-02, 4.311e-02, -1.509e-02, 2.868e-02, -6.202e-02, -2.071e-02, -2.942e-02, -1.605e-02, -1.129e-01, -4.383e-02, -2.985e-02, -1.104e-03, -2.188e-02, 2.435e-02, 3.337e-03, -1.429e-03), r1);
	r2 = MulAdd(s0_2_2, M4(-2.892e-02, 7.568e-03, -1.083e-02, 6.437e-04, 2.932e-02, 1.051e-03, -3.748e-02, -1.151e-02, 7.268e-02, -6.012e-03, -1.184e-01, -1.024e-02, -6.112e-02, -3.224e-02, -1.099e-02, -4.542e-03), r2);
	// Clamp negative values to zero, then store one accumulator per output
	// texture at this thread's pixel. T0-T2 are declared R8G8B8A8_UNORM, so
	// values above 1 are presumably clamped on write as well -- TODO confirm.
	r0 = max(r0, 0.0);
	T0[gxy] = r0;
	r1 = max(r1, 0.0);
	T1[gxy] = r1;
	r2 = max(r2, 0.0);
	T2[gxy] = r2;
}

//!PASS 6
//!DESC out-shuffle (12x12)
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, T0, T1, T2
//!OUT OUTPUT

// Pass 6 tap helpers: fetch one texel of T0 / T1 / T2 through the O() macro
// from the COMMON section (point sampler SP, offset of (x, y) texels) and
// convert it to V4. They expand to code that uses `pos` and `pt`, so both
// must be in scope at the expansion site.
#define L0(x, y) V4(O(T0, x, y))
#define L1(x, y) V4(O(T1, x, y))
#define L2(x, y) V4(O(T2, x, y))

void Pass6(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 sz = GetOutputSize();
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	float2 pos = ((gxy >> 1) + 0.5) * pt;
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	r0 = MulAdd(s0_0_0, M4(-2.277e-02, 4.422e-03, 4.234e-04, -6.235e-06, 1.498e-02, 6.211e-02, 3.837e-02, 1.085e-01, -4.602e-02, -6.198e-03, -9.613e-03, 3.447e-03, 3.057e-04, 6.855e-04, 1.840e-05, 1.363e-04), r0);
	r1 = MulAdd(s0_0_0, M4(-2.569e-02, 2.376e-03, -9.484e-04, -1.607e-03, -7.072e-02, -4.634e-02, 3.653e-02, 4.040e-02, -5.606e-02, -1.035e-02, -1.187e-02, 1.762e-03, 4.648e-04, 6.095e-04, -8.435e-05, 1.601e-05), r1);
	r2 = MulAdd(s0_0_0, M4(-1.886e-02, 4.555e-03, -3.514e-04, 7.618e-04, -3.683e-02, -7.033e-02, 1.180e-02, -7.946e-03, -4.448e-02, -6.237e-03, -8.300e-03, 4.204e-03, 7.366e-04, 1.019e-03, -3.474e-06, 4.283e-04), r2);
	r0 = MulAdd(s0_0_1, M4(-3.992e-02, -6.421e-02, 4.307e-03, 1.187e-02, -7.608e-01, 6.976e-01, -6.667e-01, 8.908e-01, -3.944e-02, -8.276e-02, 1.536e-02, -1.292e-02, -5.983e-03, -1.775e-03, -1.282e-03, 7.502e-04), r0);
	r1 = MulAdd(s0_0_1, M4(-4.830e-02, -6.946e-02, 7.123e-03, 1.606e-02, -8.618e-01, 8.309e-01, -7.574e-01, 1.036e+00, -3.941e-02, -8.958e-02, 1.935e-02, -1.053e-02, -7.558e-03, -1.932e-03, -1.609e-03, 4.844e-04), r1);
	r2 = MulAdd(s0_0_1, M4(-4.456e-02, -6.237e-02, 5.041e-03, 1.126e-02, -7.481e-01, 6.377e-01, -6.635e-01, 8.172e-01, -4.129e-02, -8.276e-02, 1.358e-02, -1.372e-02, -6.299e-03, -2.010e-03, -1.687e-03, 2.659e-04), r2);
	r0 = MulAdd(s0_0_2, M4(3.703e-03, 1.179e-03, 3.107e-03, 4.348e-03, 2.804e-02, -5.522e-02, 2.359e-02, -5.566e-02, 3.707e-04, 2.911e-02, 8.372e-03, 1.910e-02, -4.598e-03, -9.775e-03, 6.452e-04, -1.145e-03), r0);
	r1 = MulAdd(s0_0_2, M4(4.414e-03, 1.612e-05, 3.171e-03, 3.536e-03, 2.914e-02, -5.064e-02, 2.239e-02, -5.424e-02, 3.163e-03, 3.273e-02, 1.077e-02, 2.167e-02, -5.247e-03, -1.211e-02, 6.086e-04, -1.601e-03), r1);
	r2 = MulAdd(s0_0_2, M4(3.500e-03, -2.817e-04, 3.494e-03, 3.908e-03, 2.956e-02, -2.233e-02, 2.606e-02, -1.932e-02, 1.336e-03, 2.499e-02, 8.369e-03, 1.667e-02, -4.012e-03, -9.841e-03, 2.841e-04, -1.843e-03), r2);
	r0 = MulAdd(s0_1_0, M4(1.008e-01, 4.577e-03, -7.055e-02, 2.019e-02, 5.614e-02, 2.519e-03, -2.627e-01, -2.882e-02, -8.862e-02, -4.675e-03, -3.626e-02, 8.864e-03, 4.395e-03, -6.619e-04, 1.986e-03, 1.650e-03), r0);
	r1 = MulAdd(s0_1_0, M4(9.444e-02, 6.098e-03, -7.592e-02, 1.679e-02, 6.104e-02, -2.131e-03, -3.096e-01, -2.977e-02, -9.937e-02, -1.554e-02, -4.496e-02, 8.086e-03, 5.083e-03, -7.676e-04, 7.065e-04, 8.884e-04), r1);
	r2 = MulAdd(s0_1_0, M4(9.022e-02, 6.493e-03, -6.323e-02, 1.332e-02, 6.139e-02, 4.356e-03, -2.551e-01, -2.324e-02, -7.836e-02, -4.196e-03, -3.674e-02, 9.689e-03, 4.955e-03, -6.679e-04, 2.364e-03, 1.732e-03), r2);
	r0 = MulAdd(s0_1_1, M4(1.538e-01, 4.307e-01, -1.421e-01, -2.085e-01, 5.068e-02, 4.837e-02, 6.968e-02, -1.483e-01, 2.323e-01, 2.680e-02, -2.589e-01, -2.510e-01, 1.335e-03, 1.150e-02, -2.658e-03, -2.414e-03), r0);
	r1 = MulAdd(s0_1_1, M4(1.743e-01, 4.692e-01, -1.675e-01, -2.310e-01, 6.077e-02, 4.596e-02, 8.325e-02, -1.821e-01, 2.576e-01, 3.570e-02, -2.959e-01, -2.920e-01, 1.184e-02, 1.346e-02, 1.658e-03, -9.485e-04), r1);
	r2 = MulAdd(s0_1_1, M4(1.577e-01, 4.092e-01, -1.421e-01, -1.864e-01, 4.994e-02, 4.639e-02, 6.809e-02, -1.450e-01, 2.202e-01, 2.917e-02, -2.512e-01, -2.476e-01, 5.449e-03, 1.135e-02, -2.520e-03, -2.284e-03), r2);
	r0 = MulAdd(s0_1_2, M4(-1.804e-03, 1.390e-02, 7.465e-03, -7.331e-03, -2.123e-04, 1.212e-02, -4.197e-03, 3.283e-02, -8.692e-03, 1.030e-01, 1.826e-02, 6.665e-02, -2.137e-02, -1.468e-02, -4.009e-03, -7.470e-04), r0);
	r1 = MulAdd(s0_1_2, M4(-3.520e-03, 1.617e-02, 8.337e-03, -1.309e-02, -2.454e-03, 1.347e-02, -5.347e-03, 3.899e-02, -6.166e-03, 1.274e-01, 2.432e-02, 8.911e-02, -1.367e-02, 1.270e-02, 3.665e-03, 8.815e-03), r1);
	r2 = MulAdd(s0_1_2, M4(-3.743e-04, 1.752e-02, 6.065e-03, -1.361e-02, -7.596e-04, 1.127e-02, -4.269e-03, 3.324e-02, -7.052e-03, 1.086e-01, 1.748e-02, 6.663e-02, -2.288e-02, -2.577e-03, -1.680e-03, 1.697e-03), r2);
	r0 = MulAdd(s0_2_0, M4(-2.350e-02, 7.393e-03, 9.124e-03, -4.576e-03, -1.075e-04, -4.959e-03, 8.548e-03, -5.415e-03, 1.297e-02, -2.476e-03, 1.705e-03, 7.202e-03, 1.666e-02, 4.862e-03, -9.146e-03, 4.264e-04), r0);
	r1 = MulAdd(s0_2_0, M4(-2.656e-02, 9.183e-03, 1.354e-02, 9.088e-04, -8.351e-04, -6.025e-03, 8.483e-03, -6.466e-03, 1.691e-02, -5.906e-03, -1.337e-03, 4.817e-03, 1.723e-02, 4.982e-03, -5.920e-03, 2.870e-03), r1);
	r2 = MulAdd(s0_2_0, M4(-2.313e-02, 6.913e-03, 2.799e-03, 4.028e-04, -1.641e-04, -4.999e-03, 9.373e-03, -5.403e-03, 1.419e-02, -2.830e-03, 2.302e-03, 7.463e-03, 1.680e-02, 4.594e-03, -4.857e-03, 4.035e-04), r2);
	r0 = MulAdd(s0_2_1, M4(-8.501e-03, -2.823e-02, -1.340e-02, -1.150e-01, -2.828e-03, 1.956e-03, 4.802e-03, 1.530e-02, -2.096e-03, 1.788e-02, 1.509e-01, 1.013e-01, 1.048e-01, 1.654e-02, -1.382e-01, 2.290e-02), r0);
	r1 = MulAdd(s0_2_1, M4(-5.767e-03, -2.253e-02, -8.881e-03, -1.440e-01, -3.423e-03, 1.925e-03, 6.432e-03, 1.563e-02, -3.652e-03, 2.054e-02, 1.763e-01, 1.185e-01, 9.847e-02, 1.466e-02, -1.592e-01, 2.633e-02), r1);
	r2 = MulAdd(s0_2_1, M4(-7.231e-03, -2.554e-02, -2.060e-02, -1.383e-01, -2.764e-03, 1.788e-03, 4.449e-03, 1.513e-02, -4.174e-03, 1.694e-02, 1.558e-01, 1.043e-01, 9.992e-02, 1.788e-02, -1.416e-01, 2.887e-02), r2);
	r0 = MulAdd(s0_2_2, M4(-1.946e-03, -8.651e-03, -2.809e-03, -1.004e-02, 3.727e-04, 8.477e-04, 4.502e-04, 1.447e-03, 4.602e-04, 7.398e-04, 1.170e-03, 4.309e-02, 1.392e-01, 3.019e-01, -8.033e-02, -3.545e-01), r0);
	r1 = MulAdd(s0_2_2, M4(-4.735e-03, -1.034e-02, -2.635e-03, -1.905e-03, 7.731e-04, 1.359e-03, 4.981e-04, 2.005e-03, 2.759e-04, -2.357e-04, 4.703e-04, 4.751e-02, 1.499e-01, 3.115e-01, -8.862e-02, -4.145e-01), r1);
	r2 = MulAdd(s0_2_2, M4(-4.023e-03, -1.004e-02, -8.990e-04, -3.862e-03, 5.208e-04, 9.580e-04, 5.693e-04, 1.110e-03, 5.275e-04, -5.177e-04, 9.777e-04, 4.282e-02, 1.431e-01, 2.893e-01, -8.835e-02, -3.643e-01), r2);
	r0 = MulAdd(s1_0_0, M4(3.699e-02, 1.019e-02, 4.451e-03, 1.568e-04, 2.361e-02, 6.691e-03, 5.265e-03, 3.240e-03, 3.023e-03, 1.199e-03, 6.036e-05, 9.041e-04, -1.472e-03, -3.191e-03, 1.514e-03, 1.591e-03), r0);
	r1 = MulAdd(s1_0_0, M4(4.493e-02, 1.851e-02, 1.114e-02, 5.517e-03, 2.612e-02, 9.021e-03, 9.008e-03, 3.916e-03, 1.868e-03, 1.220e-04, -4.857e-04, -4.515e-04, -2.981e-03, -3.939e-03, 1.548e-03, 1.343e-03), r1);
	r2 = MulAdd(s1_0_0, M4(3.164e-02, 1.005e-02, 6.323e-03, -7.371e-04, 2.064e-02, 5.972e-03, 5.835e-03, 2.178e-03, 2.860e-03, 1.358e-03, 4.842e-04, 8.370e-04, -3.012e-03, -3.569e-03, 1.518e-03, 1.450e-03), r2);
	r0 = MulAdd(s1_0_1, M4(5.362e-02, 7.154e-02, 6.589e-03, -2.561e-03, 2.987e-02, 3.260e-02, -5.699e-03, 1.944e-02, 2.236e-03, -2.575e-03, -8.794e-04, -3.640e-03, -2.393e-03, -1.028e-02, -7.914e-04, 6.286e-04), r0);
	r1 = MulAdd(s1_0_1, M4(6.860e-02, 8.872e-02, 7.793e-03, 1.309e-03, 3.626e-02, 3.966e-02, -7.285e-03, 2.181e-02, 1.045e-03, -1.423e-03, -3.512e-03, -2.537e-03, 2.663e-04, -1.057e-02, -5.687e-04, 1.702e-03), r1);
	r2 = MulAdd(s1_0_1, M4(5.140e-02, 6.667e-02, 3.783e-03, -4.005e-03, 3.333e-02, 2.866e-02, -7.582e-03, 1.785e-02, 4.147e-03, -1.200e-04, -7.421e-04, -1.279e-03, -1.134e-03, -1.110e-02, -7.468e-04, 3.063e-04), r2);
	r0 = MulAdd(s1_0_2, M4(-2.010e-04, 2.109e-02, -2.675e-03, -6.665e-04, -6.875e-03, -1.382e-02, -7.842e-03, -4.983e-03, -2.642e-03, -1.313e-03, 1.126e-03, 2.494e-03, 6.408e-04, 1.212e-02, 1.682e-03, 5.269e-03), r0);
	r1 = MulAdd(s1_0_2, M4(-1.227e-05, 2.351e-02, -2.367e-03, -1.070e-04, -7.971e-03, -1.886e-02, -7.721e-03, -6.554e-03, -3.483e-03, -2.866e-03, 9.008e-04, 1.274e-03, 2.869e-04, 1.465e-02, 1.450e-03, 6.718e-03), r1);
	r2 = MulAdd(s1_0_2, M4(-1.025e-03, 1.837e-02, -3.061e-03, -2.219e-03, -8.161e-03, -1.570e-02, -7.552e-03, -4.564e-03, -2.413e-03, -1.546e-03, 8.286e-04, 2.015e-03, 8.081e-05, 1.257e-02, 1.601e-03, 6.290e-03), r2);
	r0 = MulAdd(s1_1_0, M4(-1.214e-02, 9.746e-03, 2.259e-02, -1.348e-02, -4.306e-02, -1.228e-03, -1.278e-02, 9.610e-03, -1.461e-02, -1.866e-03, 1.858e-03, 3.871e-03, -1.410e-02, -1.062e-03, -7.953e-03, -9.791e-03), r0);
	r1 = MulAdd(s1_1_0, M4(-1.629e-02, 1.088e-02, 1.204e-02, -1.724e-02, -5.467e-02, -3.430e-03, -2.119e-02, 8.757e-03, -9.366e-03, -1.007e-03, -9.616e-04, 6.188e-03, -9.946e-03, 5.755e-07, -1.035e-02, -8.145e-03), r1);
	r2 = MulAdd(s1_1_0, M4(-8.207e-03, 1.126e-02, 1.683e-02, -1.341e-02, -3.943e-02, -1.923e-03, -1.478e-02, 9.920e-03, -9.692e-03, -2.449e-03, -2.599e-03, 5.097e-03, -9.923e-03, -6.698e-04, -1.099e-02, -8.542e-03), r2);
	r0 = MulAdd(s1_1_1, M4(1.443e-01, -3.565e-01, 1.755e-01, 1.440e-01, -6.231e-01, -1.215e-02, 2.690e-01, 4.881e-02, 1.047e-01, -1.422e-01, -2.100e-04, -5.325e-03, 4.654e-02, 1.040e-02, 3.621e-02, -5.114e-02), r0);
	r1 = MulAdd(s1_1_1, M4(1.563e-01, -4.246e-01, 1.912e-01, 1.534e-01, -7.352e-01, -1.410e-02, 3.076e-01, 4.236e-02, 1.092e-01, -1.491e-01, 6.915e-03, -1.249e-03, 5.713e-02, 2.214e-02, 5.720e-02, -5.684e-02), r1);
	r2 = MulAdd(s1_1_1, M4(1.493e-01, -3.476e-01, 1.705e-01, 1.374e-01, -6.195e-01, -1.157e-02, 2.627e-01, 5.116e-02, 9.502e-02, -1.403e-01, 5.878e-03, -4.175e-03, 3.577e-02, 1.536e-02, 4.456e-02, -5.409e-02), r2);
	r0 = MulAdd(s1_1_2, M4(-2.273e-03, 3.137e-02, 1.358e-03, 3.972e-02, -3.122e-02, 8.656e-02, -2.380e-02, 2.296e-02, 2.186e-03, 3.481e-02, -4.925e-03, 6.679e-03, 5.046e-03, 1.160e-01, 9.720e-03, 5.356e-02), r0);
	r1 = MulAdd(s1_1_2, M4(-1.003e-03, 4.067e-02, 8.904e-04, 4.643e-02, -2.972e-02, 1.121e-01, -2.252e-02, 1.428e-02, 1.522e-03, 3.796e-02, -5.840e-03, 5.198e-03, 6.184e-03, 1.284e-01, 1.085e-02, 6.170e-02), r1);
	r2 = MulAdd(s1_1_2, M4(-2.015e-03, 3.430e-02, 5.492e-04, 3.800e-02, -2.597e-02, 8.666e-02, -2.203e-02, 1.854e-02, 1.681e-03, 3.504e-02, -3.794e-03, 7.081e-03, 5.565e-03, 1.126e-01, 7.864e-03, 5.137e-02), r2);
	r0 = MulAdd(s1_2_0, M4(-3.492e-02, -5.729e-04, 2.431e-03, -8.578e-03, -1.024e-02, -1.131e-03, -1.925e-02, 1.122e-02, -1.240e-03, 8.150e-03, -3.183e-02, 6.010e-03, 2.643e-02, 1.639e-03, 1.094e-02, -7.437e-03), r0);
	r1 = MulAdd(s1_2_0, M4(-4.866e-02, -1.909e-03, 1.522e-03, -1.473e-02, -9.927e-03, -1.824e-03, -2.362e-02, 1.504e-02, 4.972e-03, 1.537e-02, -1.317e-02, 1.043e-02, 2.846e-02, 2.758e-03, 1.793e-02, -1.032e-02), r1);
	r2 = MulAdd(s1_2_0, M4(-3.369e-02, -2.609e-03, 7.507e-03, -7.675e-03, -9.010e-03, -1.906e-03, -2.110e-02, 1.210e-02, -2.689e-03, 9.044e-03, -1.861e-02, 5.095e-03, 2.316e-02, 2.623e-03, 1.714e-02, -9.116e-03), r2);
	r0 = MulAdd(s1_2_1, M4(2.394e-03, -2.946e-03, 4.553e-02, -3.623e-01, -1.972e-02, 5.105e-03, 2.006e-01, 2.101e-02, 1.392e-01, -1.755e-01, 4.111e-01, -3.736e-01, 8.230e-02, -1.248e-02, -6.309e-01, -8.654e-04), r0);
	r1 = MulAdd(s1_2_1, M4(3.570e-03, 5.169e-03, 6.323e-02, -4.038e-01, -2.067e-02, 1.017e-02, 2.428e-01, 1.708e-02, 1.567e-01, -1.984e-01, 4.465e-01, -4.209e-01, 1.060e-01, -3.045e-02, -7.402e-01, 1.050e-02), r1);
	r2 = MulAdd(s1_2_1, M4(9.376e-04, 9.891e-04, 5.922e-02, -3.447e-01, -1.788e-02, 3.540e-03, 2.046e-01, 1.996e-02, 1.444e-01, -1.753e-01, 3.847e-01, -3.721e-01, 8.862e-02, -1.138e-02, -6.349e-01, 4.477e-03), r2);
	r0 = MulAdd(s1_2_2, M4(1.602e-03, 8.971e-04, 1.997e-03, 1.483e-02, 4.806e-03, -1.509e-02, -1.229e-02, 6.041e-02, 3.147e-04, 1.954e-02, -1.563e-02, 4.765e-02, 1.922e-02, 9.155e-02, 1.873e-02, 1.422e-01), r0);
	r1 = MulAdd(s1_2_2, M4(1.766e-03, -3.791e-04, 3.632e-03, 1.887e-02, 2.200e-03, -1.849e-02, -1.580e-02, 6.181e-02, -3.152e-03, 1.569e-02, -2.058e-02, 4.651e-02, 2.230e-02, 9.302e-02, 2.147e-02, 1.667e-01), r1);
	r2 = MulAdd(s1_2_2, M4(9.602e-04, -1.021e-03, 2.009e-03, 1.653e-02, 2.762e-03, -1.558e-02, -1.187e-02, 6.243e-02, -1.064e-03, 1.970e-02, -1.622e-02, 4.965e-02, 1.923e-02, 8.763e-02, 2.017e-02, 1.382e-01), r2);
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	r0 = MulAdd(s0_0_0, M4(-9.605e-04, 3.187e-04, -2.777e-03, 9.179e-04, -3.742e-03, 1.297e-03, -1.851e-03, -1.778e-04, 1.613e-02, -3.156e-03, 7.123e-04, 8.806e-04, 3.741e-03, 2.432e-03, -7.548e-04, -2.137e-03), r0);
	r1 = MulAdd(s0_0_0, M4(-1.082e-03, 7.413e-04, -2.314e-03, 1.067e-03, -5.137e-03, 1.841e-04, -2.835e-03, 1.435e-04, 1.867e-02, -2.501e-03, -1.164e-03, 1.374e-04, 4.826e-03, 2.183e-03, -1.799e-03, -2.041e-03), r1);
	r2 = MulAdd(s0_0_0, M4(-1.094e-03, 2.717e-04, -1.934e-03, 1.155e-03, -3.450e-03, 1.320e-03, -2.519e-03, -7.322e-06, 1.525e-02, -2.914e-03, -6.143e-04, 6.135e-04, 4.591e-03, 2.138e-03, -8.323e-04, -1.741e-03), r2);
	r0 = MulAdd(s0_0_1, M4(2.377e-02, 5.519e-03, 3.712e-03, 1.882e-03, -5.877e-02, -1.886e-02, -2.205e-02, -4.582e-03, -7.355e-03, 5.151e-02, -9.632e-03, -1.430e-03, 3.870e-02, 9.101e-03, 6.861e-03, -8.329e-03), r0);
	r1 = MulAdd(s0_0_1, M4(2.169e-02, 4.412e-03, 2.792e-03, 4.134e-03, -7.544e-02, -3.552e-02, -3.406e-02, -1.502e-02, -1.491e-02, 4.260e-02, -1.681e-02, -1.247e-02, 4.630e-02, 1.240e-02, 8.661e-03, -7.002e-03), r1);
	r2 = MulAdd(s0_0_1, M4(2.471e-02, 5.937e-03, 2.249e-03, 2.579e-03, -5.213e-02, -1.984e-02, -2.068e-02, -3.657e-03, 2.216e-03, 5.304e-02, -8.160e-03, 5.389e-04, 3.772e-02, 1.047e-02, 5.538e-03, -8.164e-03), r2);
	r0 = MulAdd(s0_0_2, M4(-9.107e-02, 8.277e-02, 1.235e-03, -2.504e-02, -1.967e-02, -1.179e-01, -1.070e-02, -3.553e-02, -9.516e-04, -3.406e-02, 9.499e-04, -8.692e-03, 2.954e-02, 7.153e-02, 2.805e-04, 1.376e-03), r0);
	r1 = MulAdd(s0_0_2, M4(-9.399e-02, 8.765e-02, 5.600e-03, -2.776e-02, -2.545e-02, -1.304e-01, -1.360e-02, -3.821e-02, -3.120e-03, -4.041e-02, 2.827e-04, -1.205e-02, 2.704e-02, 7.905e-02, -1.669e-03, -5.553e-04), r1);
	r2 = MulAdd(s0_0_2, M4(-9.427e-02, 7.838e-02, 2.175e-03, -2.236e-02, -2.165e-02, -1.116e-01, -1.132e-02, -3.160e-02, 3.352e-04, -2.936e-02, 1.574e-03, -8.084e-03, 2.967e-02, 7.897e-02, 2.459e-03, 2.792e-03), r2);
	r0 = MulAdd(s0_1_0, M4(1.801e-03, 3.843e-03, 2.226e-03, -2.899e-05, 3.317e-03, -8.219e-04, -1.042e-03, 3.710e-03, 8.677e-02, 5.729e-03, 6.815e-02, -3.281e-03, 8.378e-04, -9.263e-04, 1.082e-02, 2.607e-03), r0);
	r1 = MulAdd(s0_1_0, M4(1.631e-03, 4.237e-03, 1.357e-03, -2.321e-04, 4.080e-03, -2.684e-03, -2.024e-04, 3.159e-03, 1.150e-01, 1.598e-02, 8.966e-02, 3.635e-03, 2.314e-03, 3.678e-05, 1.138e-02, 1.779e-03), r1);
	r2 = MulAdd(s0_1_0, M4(1.445e-03, 3.394e-03, 1.036e-03, -2.586e-04, 3.231e-03, -8.687e-04, -1.729e-04, 3.700e-03, 8.402e-02, 2.972e-03, 6.694e-02, -3.634e-03, 1.626e-03, -9.405e-05, 1.128e-02, 1.713e-03), r2);
	r0 = MulAdd(s0_1_1, M4(2.352e-03, -1.067e-02, 2.386e-02, -3.439e-03, 2.920e-01, 9.905e-03, 1.598e-01, 3.137e-02, -1.968e-01, 2.144e-01, -4.343e-02, 2.862e-01, -4.507e-02, -2.640e-02, 1.026e-02, 1.537e-02), r0);
	r1 = MulAdd(s0_1_1, M4(-8.069e-03, -1.849e-02, 1.562e-02, -1.206e-02, 3.408e-01, 2.048e-02, 1.942e-01, 4.726e-02, -2.193e-01, 2.568e-01, -4.797e-02, 3.332e-01, -4.485e-02, -2.667e-02, 9.611e-03, 1.233e-02), r1);
	r2 = MulAdd(s0_1_1, M4(6.187e-03, -1.053e-02, 3.022e-02, -4.186e-03, 2.819e-01, 1.290e-02, 1.567e-01, 3.211e-02, -1.978e-01, 2.095e-01, -3.976e-02, 2.805e-01, -4.826e-02, -2.224e-02, 1.528e-02, 1.420e-02), r2);
	r0 = MulAdd(s0_1_2, M4(-2.209e-01, 1.938e-01, -2.417e-01, 2.800e-01, -5.486e-02, -1.284e-02, -7.747e-02, -2.378e-01, 3.403e-03, -1.121e-01, -2.645e-03, -9.153e-02, 1.500e-02, -5.332e-01, 2.185e-02, 1.489e-01), r0);
	r1 = MulAdd(s0_1_2, M4(-2.512e-01, 2.105e-01, -2.705e-01, 3.057e-01, -5.848e-02, -6.765e-03, -8.130e-02, -2.646e-01, 1.793e-03, -1.287e-01, -4.513e-03, -1.038e-01, 3.720e-02, -6.191e-01, 1.878e-02, 1.846e-01), r1);
	r2 = MulAdd(s0_1_2, M4(-2.192e-01, 1.880e-01, -2.437e-01, 2.666e-01, -5.537e-02, -2.403e-02, -7.352e-02, -2.388e-01, 1.836e-03, -1.086e-01, -9.311e-04, -8.276e-02, 1.192e-02, -5.449e-01, 1.973e-02, 1.500e-01), r2);
	r0 = MulAdd(s0_2_0, M4(-4.242e-03, -6.539e-04, -3.970e-03, 9.940e-04, 4.864e-04, 1.229e-03, 3.242e-03, -4.086e-04, -1.896e-02, -8.691e-04, 4.481e-03, -7.520e-03, -1.831e-03, 1.143e-03, 1.172e-03, 6.587e-04), r0);
	r1 = MulAdd(s0_2_0, M4(-4.043e-03, -4.469e-04, -3.858e-03, 1.086e-03, -7.299e-04, 9.388e-04, 1.606e-03, -1.979e-03, -2.128e-02, 2.998e-03, 1.474e-02, -4.500e-03, -3.346e-03, 3.670e-04, 2.095e-03, 4.531e-04), r1);
	r2 = MulAdd(s0_2_0, M4(-4.037e-03, -3.464e-04, -3.521e-03, 8.718e-04, 3.148e-04, 1.409e-03, 2.912e-03, -5.722e-04, -2.105e-02, -6.306e-04, 3.146e-03, -9.371e-03, -1.826e-03, 6.686e-04, 4.236e-03, 1.297e-03), r2);
	r0 = MulAdd(s0_2_1, M4(1.298e-02, -1.529e-03, 1.383e-02, -5.082e-03, 1.429e-03, 2.406e-03, 8.127e-02, -1.016e-02, -1.207e-02, -2.703e-02, -9.055e-02, -2.931e-02, 2.207e-02, 2.737e-03, 7.985e-02, -1.727e-03), r0);
	r1 = MulAdd(s0_2_1, M4(1.079e-02, -3.039e-03, 6.023e-03, -7.776e-03, -1.661e-03, 3.237e-03, 8.561e-02, -1.351e-02, -1.552e-02, -3.458e-02, -1.018e-01, -3.458e-02, 2.705e-02, 4.032e-03, 8.152e-02, 7.414e-04), r1);
	r2 = MulAdd(s0_2_1, M4(1.257e-02, -1.513e-03, 1.332e-02, -3.766e-03, 7.344e-04, 2.334e-03, 7.690e-02, -1.066e-02, -1.161e-02, -2.786e-02, -9.106e-02, -2.959e-02, 2.115e-02, 4.214e-03, 7.223e-02, 1.279e-03), r2);
	r0 = MulAdd(s0_2_2, M4(-4.658e-03, 6.083e-03, -3.345e-02, 5.579e-02, 2.342e-04, -1.510e-02, -1.010e-02, 8.912e-02, -3.565e-03, -4.545e-03, -1.685e-03, -4.431e-02, -7.475e-03, -1.070e-03, -3.960e-03, 1.408e-01), r0);
	r1 = MulAdd(s0_2_2, M4(-4.120e-04, 5.999e-03, -3.439e-02, 5.728e-02, 1.961e-03, -1.437e-02, -1.581e-02, 1.037e-01, -3.598e-03, -3.322e-03, -2.977e-03, -5.209e-02, -6.548e-03, 2.613e-03, -1.239e-02, 1.596e-01), r1);
	r2 = MulAdd(s0_2_2, M4(-1.046e-03, 7.980e-03, -3.126e-02, 5.707e-02, 2.107e-03, -1.326e-02, -1.569e-02, 8.278e-02, -2.450e-03, -2.378e-03, -2.741e-03, -4.382e-02, -4.875e-03, 3.715e-03, -9.835e-03, 1.275e-01), r2);
	float2 opt = float2(GetOutputPt()), fpos = (float2(gxy) + 0.5) * opt;
	OUTPUT[gxy + int2(0, 0)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(0.0, 0.0) * opt, 0).rgb + MF3(r0.x, r1.x, r2.x)), 1.0);
	OUTPUT[gxy + int2(1, 0)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(1.0, 0.0) * opt, 0).rgb + MF3(r0.y, r1.y, r2.y)), 1.0);
	OUTPUT[gxy + int2(0, 1)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(0.0, 1.0) * opt, 0).rgb + MF3(r0.z, r1.z, r2.z)), 1.0);
	OUTPUT[gxy + int2(1, 1)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(1.0, 1.0) * opt, 0).rgb + MF3(r0.w, r1.w, r2.w)), 1.0);
}
