// CuNNy 6x8C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D08N06
//!USE MulAdd
//!CAPABILITY FP16

#include "../StubDefs.hlsli"

// Texture read by pass 1 (`//!IN INPUT`).
//!TEXTURE
Texture2D INPUT;

// Declared at twice the INPUT width and height. None of the passes visible in
// this portion of the file write to it.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point-filtered sampler; the O() macro samples through it.
//!SAMPLER
//!FILTER POINT
SamplerState SP;

// Linear-filtered sampler; not referenced by the passes visible in this portion
// of the file.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// O(t, p): sample texture t through the point sampler SP at mip level 0, at the
// coordinate `pos + p * pt`. `pos` and `pt` are not macro parameters: they must be
// defined at the expansion site (the visible passes set pt from GetInputPt() and
// pos to (gxy + 0.5) * pt). The offset p is parenthesised so that a compound
// expression passed as p is scaled by pt as a whole.
#define O(t, p) t.SampleLevel(SP, pos + (p) * pt, 0)
// Short aliases for the 4-component vector and 4x4 matrix types used for features
// and weights. NOTE(review): MF4 / MF4x4 are not defined in this file; presumably
// supplied by Magpie -- confirm.
#define V4 MF4
#define M4 MF4x4

// Intermediate textures t0..t3, each the size of INPUT and stored as
// R8G8B8A8_SNORM. In the passes visible here: pass 1 writes t0/t1, pass 2 reads
// t0/t1 and writes t2/t3, pass 3 reads t2/t3 and writes t0/t1 again.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t2;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t3;

//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0, t1

// l0(x, y): scalar input tap for pass 1. Samples INPUT at offset (x, y) (scaled by
// pt inside O()), takes the dot product of its RGB channels with a fixed weight
// vector, and adds a fixed offset.
#define l0(x, y) (dot(MF3(2.668e-01, 5.128e-01, 1.094e-01), O(INPUT, float2(x, y)).rgb) + MF(-8.262e-01))

// Pass 1, first output vector (Pass1 stores it in t0).
// Each of the nine scalar taps scales its own constant 4-component weight vector;
// the products are accumulated with mad() on top of a constant starting vector,
// in tap order s0_0 .. s0_8.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	V4 acc = V4(2.110e-03, -3.283e-04, -1.730e-02, -4.942e-03);
	acc = mad(s0_0, V4(3.453e-03, -2.009e-02, 1.029e-02, -4.229e-02), acc);
	acc = mad(s0_1, V4(-1.162e-01, -3.046e-01, 3.944e-01, -3.426e-02), acc);
	acc = mad(s0_2, V4(4.098e-02, 2.644e-01, 7.739e-03, 1.740e-02), acc);
	acc = mad(s0_3, V4(-5.645e-01, 3.510e-01, -1.381e-01, 3.015e-01), acc);
	acc = mad(s0_4, V4(5.692e-01, 2.277e-02, -1.002e-02, -2.280e-01), acc);
	acc = mad(s0_5, V4(1.874e-02, -2.802e-01, -2.867e-02, -9.688e-02), acc);
	acc = mad(s0_6, V4(3.162e-02, -3.273e-01, -1.823e-03, 2.738e-02), acc);
	acc = mad(s0_7, V4(9.148e-02, 3.054e-01, 4.438e-02, 2.153e-01), acc);
	acc = mad(s0_8, V4(-7.020e-02, -1.003e-02, 3.250e-03, -1.590e-01), acc);
	return acc;
}

// Pass 1, second output vector (Pass1 stores it in t1).
// Same shape as f0 for this pass: a constant starting vector plus, for every one
// of the nine scalar taps, that tap times its own constant weight vector,
// accumulated with mad() in tap order s0_0 .. s0_8.
V4 f1(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	V4 acc = V4(3.018e-03, 1.376e-02, -1.274e-02, 5.625e-03);
	acc = mad(s0_0, V4(4.130e-03, -1.028e-02, -7.847e-02, -1.277e-01), acc);
	acc = mad(s0_1, V4(1.850e-02, 1.097e-02, -1.303e-01, 1.772e-01), acc);
	acc = mad(s0_2, V4(-3.426e-02, 3.168e-01, 9.948e-03, -4.789e-02), acc);
	acc = mad(s0_3, V4(1.225e-03, 1.298e-02, 2.803e-01, -1.551e-02), acc);
	acc = mad(s0_4, V4(-1.303e-02, -1.703e-02, 1.588e-01, 2.124e-01), acc);
	acc = mad(s0_5, V4(5.410e-01, -2.998e-01, 2.618e-02, -1.909e-01), acc);
	acc = mad(s0_6, V4(-1.382e-03, 4.087e-03, 1.879e-01, 4.719e-02), acc);
	acc = mad(s0_7, V4(-3.849e-03, 1.123e-02, -2.463e-01, -5.789e-02), acc);
	acc = mad(s0_8, V4(-5.129e-01, -2.681e-02, 3.530e-02, 7.564e-03), acc);
	return acc;
}

// Pass 1 ("in") entry point.
// Maps the thread to a texel via Rmp8x8(tid.x) + blockStart, gathers the 3x3
// neighbourhood of l0() taps around it and writes f0 / f1 of those taps to t0 / t1.
// Threads whose texel falls outside GetInputSize() write nothing.
void Pass1(uint2 blockStart, uint3 tid) {
	uint2 dim = GetInputSize();
	uint2 px = blockStart + Rmp8x8(tid.x);
	if (!(px.x < dim.x && px.y < dim.y)) {
		return;
	}

	// `pt` and `pos` are read by the O() macro inside l0(); the names must stay.
	float2 pt = float2(GetInputPt());
	float2 pos = (px + 0.5) * pt;

	// Taps are taken row by row: y = -1, 0, 1 and, within a row, x = -1, 0, 1.
	MF tap0 = l0(-1.0, -1.0);
	MF tap1 = l0(0.0, -1.0);
	MF tap2 = l0(1.0, -1.0);
	MF tap3 = l0(-1.0, 0.0);
	MF tap4 = l0(0.0, 0.0);
	MF tap5 = l0(1.0, 0.0);
	MF tap6 = l0(-1.0, 1.0);
	MF tap7 = l0(0.0, 1.0);
	MF tap8 = l0(1.0, 1.0);

	t0[px] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
	t1[px] = f1(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}

//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3

// Pass 2 taps: l0 / l1 sample t0 / t1 at offset (x, y) (scaled by pt inside O())
// and convert the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))

// Pass 2 ("conv1"), first output vector (Pass2 stores it in t2).
// Starts from a constant 4-component vector and folds in each of the 36 inputs,
// in argument order, through MulAdd with that input's own constant 4x4 matrix.
// As called from Pass2: s0_* = max(v, 0) and s1_* = -max(-v, 0) of the nine t0
// taps; s2_* / s3_* are the same split of the nine t1 taps.
// NOTE(review): MulAdd is pulled in by `//!USE MulAdd` and is not defined in this
// file; presumably it computes mul(s, M) + r -- confirm against Magpie.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { 6.864e-03, -7.720e-03, -1.170e-02, 2.714e-03 };
	r = MulAdd(s0_0, M4(1.042e-01, -4.946e-02, 6.554e-02, -8.392e-03, 1.283e-01, 1.651e-01, 3.683e-04, -1.051e-01, 4.040e-02, -6.936e-02, 1.557e-01, -1.129e-01, 2.308e-01, -4.099e-02, -5.587e-02, 5.584e-02), r);
	r = MulAdd(s0_1, M4(-6.587e-02, -5.729e-02, 2.461e-01, -1.196e-02, -4.689e-02, 1.865e-01, 1.570e-02, 3.911e-03, -1.411e-01, -3.073e-02, 4.971e-02, -2.336e-01, 1.874e-01, -9.632e-02, 1.814e-01, -3.548e-01), r);
	r = MulAdd(s0_2, M4(1.724e-01, 2.876e-01, 1.552e-01, -2.105e-01, 1.037e-01, 3.116e-01, 1.832e-01, -8.618e-02, 6.401e-02, 1.319e-01, 3.862e-02, 2.301e-01, -1.265e-02, -2.864e-01, -3.199e-02, 2.311e-01), r);
	r = MulAdd(s0_3, M4(4.153e-02, -5.609e-02, -1.424e-01, 3.368e-02, 2.271e-01, -8.217e-02, -8.373e-02, -2.666e-01, -3.089e-01, -3.909e-02, -3.284e-02, -1.384e-01, 1.265e-02, 2.006e-01, -5.720e-02, 1.076e-01), r);
	r = MulAdd(s0_4, M4(-4.261e-02, 1.184e-01, 3.302e-01, -2.949e-01, -3.369e-02, 3.576e-02, 1.019e-01, 3.097e-01, -4.891e-02, -2.452e-01, 5.598e-02, -2.664e-01, 6.785e-02, -1.832e-02, -6.347e-01, 2.930e-01), r);
	r = MulAdd(s0_5, M4(5.791e-01, -1.328e-01, -2.570e-01, 2.606e-01, 8.937e-02, -2.239e-01, 6.957e-02, 1.786e-02, 2.350e-01, -4.333e-03, 9.579e-02, 2.193e-02, -2.945e-02, 1.651e-01, -8.800e-02, 3.369e-01), r);
	r = MulAdd(s0_6, M4(-1.784e-01, -1.200e-01, -1.743e-01, 1.992e-01, -3.070e-02, -1.489e-01, -1.489e-01, 4.937e-03, -5.589e-01, -5.703e-01, 5.314e-01, -5.588e-01, -2.519e-02, 2.655e-03, -3.409e-02, 2.062e-01), r);
	r = MulAdd(s0_7, M4(-4.764e-02, -2.769e-01, -8.199e-01, 6.832e-02, -1.419e-02, -5.326e-02, -2.143e-01, -6.217e-03, -2.033e-01, 9.656e-02, 1.715e-01, -2.919e-01, 1.069e-01, 2.729e-01, -8.952e-04, 1.155e-01), r);
	r = MulAdd(s0_8, M4(1.953e-02, -3.145e-01, 1.220e-02, -1.279e-01, -1.510e-01, -1.802e-01, -9.842e-02, 1.264e-02, -2.215e-01, 2.586e-02, 4.424e-01, 6.816e-01, -7.124e-02, 1.132e-01, 4.702e-02, 4.041e-01), r);
	r = MulAdd(s1_0, M4(9.269e-02, 1.047e-02, 5.079e-02, 5.864e-02, -3.747e-02, 1.270e-01, 1.255e-01, -1.602e-01, 7.921e-02, 5.447e-03, 1.268e-01, -4.616e-02, 2.253e-01, -9.203e-03, 6.893e-04, -8.571e-02), r);
	r = MulAdd(s1_1, M4(-1.425e-01, 3.062e-02, 2.404e-01, -3.486e-01, -4.895e-01, -1.163e-01, 1.275e-01, -2.587e-01, -4.745e-02, 5.669e-02, 7.794e-02, -1.012e-01, 8.594e-02, -4.235e-01, 3.494e-01, -3.638e-01), r);
	r = MulAdd(s1_2, M4(-2.751e-01, -4.543e-01, 2.819e-01, -3.893e-01, -8.049e-03, 2.251e-01, 1.453e-01, -2.370e-01, 3.627e-02, 2.143e-01, 1.543e-02, 1.223e-01, 6.544e-02, -2.368e-01, 9.235e-03, 2.656e-01), r);
	r = MulAdd(s1_3, M4(1.210e-01, 9.555e-02, -5.417e-02, 1.165e-01, -1.068e-01, 6.166e-02, -2.891e-02, -8.389e-02, -1.224e-01, 5.482e-02, 2.476e-02, -8.222e-03, -1.536e-01, 6.750e-02, -1.824e-01, 1.234e-01), r);
	r = MulAdd(s1_4, M4(-2.644e-02, 1.267e-01, 4.029e-01, 2.617e-01, -4.951e-01, 1.263e-01, 1.485e-01, 5.693e-01, -8.725e-03, 2.043e-01, -4.077e-03, -1.607e-01, -2.110e-01, -4.477e-02, -5.539e-01, 3.658e-01), r);
	r = MulAdd(s1_5, M4(2.017e-01, 6.315e-01, -2.568e-01, -6.606e-01, 4.404e-01, -4.315e-01, -2.281e-02, -3.790e-01, 6.808e-02, -3.220e-02, -7.074e-02, -8.168e-02, -3.823e-01, 1.807e-01, 9.908e-02, -7.475e-02), r);
	r = MulAdd(s1_6, M4(-1.382e-01, -2.290e-01, -1.867e-01, 2.769e-01, -6.167e-02, -1.228e-01, -1.397e-01, 1.529e-01, -1.591e-02, -4.457e-02, 2.053e-01, 2.664e-02, -9.287e-02, 3.469e-02, 8.066e-02, -5.675e-02), r);
	r = MulAdd(s1_7, M4(3.744e-01, -3.467e-01, -9.004e-01, 5.327e-01, 5.426e-02, -6.985e-02, -3.799e-01, 4.876e-01, -6.079e-02, 2.228e-03, 2.478e-02, -2.092e-01, 2.800e-02, 2.458e-01, -1.370e-02, -1.551e-01), r);
	r = MulAdd(s1_8, M4(2.552e-01, 1.097e-01, -1.182e-01, 1.627e-01, 2.166e-01, -4.759e-03, -1.014e-01, 2.126e-02, 6.229e-02, 2.329e-02, 9.464e-02, 4.757e-02, 3.131e-03, -2.587e-03, -2.741e-02, -4.537e-02), r);
	r = MulAdd(s2_0, M4(1.745e-01, 4.940e-01, 2.727e-02, -8.500e-02, -2.356e-01, -1.500e-01, 9.724e-03, -2.077e-01, -5.020e-02, -1.467e-01, 1.685e-01, 3.359e-01, 2.094e-01, 1.175e-02, -6.380e-02, 4.710e-02), r);
	r = MulAdd(s2_1, M4(-3.113e-02, 8.027e-01, -4.982e-03, -1.428e-01, 6.573e-02, 4.074e-04, 1.637e-01, 1.466e-01, 3.238e-01, 2.433e-01, -3.147e-01, -1.136e-02, -3.248e-02, -1.969e-01, 9.390e-02, -2.485e-01), r);
	r = MulAdd(s2_2, M4(-1.045e-01, 2.494e-01, -3.734e-02, 4.765e-02, 5.332e-03, -2.832e-01, -2.026e-01, -1.339e-02, -4.321e-01, -7.626e-01, 1.908e-01, 3.502e-01, 3.451e-01, 1.199e-01, -7.371e-02, -6.362e-02), r);
	r = MulAdd(s2_3, M4(-1.629e-01, -4.320e-01, 7.872e-01, -2.766e-01, -1.373e-01, 1.317e-02, -1.461e-01, 2.414e-02, -9.297e-02, 3.045e-02, -6.804e-02, 8.275e-02, -3.640e-01, -1.441e-01, -3.555e-02, 1.300e-01), r);
	r = MulAdd(s2_4, M4(-3.256e-01, -5.021e-01, 1.595e-01, 4.114e-01, 7.795e-02, 1.064e-01, -9.154e-02, 1.568e-01, -2.319e-01, -8.268e-02, 3.692e-01, -9.510e-03, -1.566e-01, 1.517e-01, 2.046e-01, -5.032e-02), r);
	r = MulAdd(s2_5, M4(-2.343e-01, -3.166e-01, -2.043e-04, -5.327e-02, -2.782e-02, -2.989e-01, -7.909e-02, 1.890e-01, -2.144e-01, 5.248e-01, -3.855e-02, 1.994e-01, -3.525e-01, -6.465e-02, -1.340e-02, 1.749e-01), r);
	r = MulAdd(s2_6, M4(2.183e-02, 1.909e-03, 3.262e-02, 7.862e-02, -3.753e-01, 2.833e-01, 4.159e-01, -2.270e-01, 2.925e-02, -2.160e-01, -1.200e-01, -3.268e-02, -1.949e-01, -1.118e-01, 8.766e-02, 2.260e-02), r);
	r = MulAdd(s2_7, M4(-1.560e-02, -3.559e-02, 3.894e-02, -7.496e-02, 1.947e-01, 4.921e-01, 2.227e-01, -2.002e-01, 3.045e-01, -1.339e-01, -5.773e-01, -2.032e-01, -2.834e-02, -1.129e-01, -2.371e-01, -6.408e-03), r);
	r = MulAdd(s2_8, M4(1.109e-02, -4.708e-02, -3.267e-02, -6.323e-02, 2.935e-01, 2.542e-01, 1.224e-01, -9.832e-02, -1.134e-02, -7.131e-02, -5.557e-02, 5.606e-01, 2.551e-01, 2.313e-02, -8.756e-02, 8.658e-02), r);
	r = MulAdd(s3_0, M4(-4.977e-01, 7.734e-02, 1.035e-01, -5.555e-01, -1.226e-01, -7.049e-02, -6.882e-02, -3.950e-02, -5.240e-03, -1.275e-01, 1.254e-01, 1.743e-01, 5.163e-02, 5.000e-03, -8.731e-02, -6.371e-02), r);
	r = MulAdd(s3_1, M4(-3.240e-02, -1.207e-01, 1.163e-01, -1.555e-01, 1.009e-01, -1.052e-01, 2.058e-01, 2.205e-01, 6.038e-02, 1.410e-01, -1.091e-01, -2.054e-02, -3.050e-01, -6.117e-03, 1.110e-01, -1.869e-01), r);
	r = MulAdd(s3_2, M4(1.899e-01, 9.263e-02, -4.842e-02, 2.578e-02, -1.349e-01, -1.889e-01, -1.932e-01, 2.278e-02, -1.531e-01, -1.199e-01, 2.886e-03, 3.222e-02, -2.776e-02, 1.405e-01, -6.920e-02, -5.137e-01), r);
	r = MulAdd(s3_3, M4(-4.189e-01, 1.880e-01, -1.234e-01, 7.659e-01, 5.997e-01, -1.043e-01, -1.229e-01, 6.046e-01, -1.916e-01, 5.904e-02, 2.624e-02, 1.703e-02, -1.891e-02, 3.916e-02, 1.597e-01, -1.509e-01), r);
	r = MulAdd(s3_4, M4(5.301e-01, -1.048e-01, -8.784e-02, -4.903e-01, 3.227e-01, -4.585e-01, -2.122e-01, 3.939e-01, -1.169e-01, -8.373e-03, 1.272e-01, 6.810e-02, 2.517e-01, 2.839e-01, 1.257e-01, -4.846e-01), r);
	r = MulAdd(s3_5, M4(1.499e-01, -2.604e-01, -1.691e-01, -3.895e-01, -4.073e-01, 3.564e-02, 8.605e-02, -2.012e-01, 4.062e-02, 4.141e-02, 1.726e-02, -9.629e-02, 1.348e-01, -9.468e-02, 2.113e-03, -7.229e-02), r);
	r = MulAdd(s3_6, M4(-5.457e-02, 1.150e-01, -1.297e-01, 2.096e-01, 4.355e-01, -6.530e-02, -6.438e-01, 6.335e-01, 5.057e-02, -2.750e-01, -2.055e-01, 8.634e-02, -5.581e-01, -1.185e-01, 2.945e-02, 1.703e-01), r);
	r = MulAdd(s3_7, M4(-1.032e-02, -4.045e-02, 9.848e-02, -1.865e-01, -1.078e-01, -4.990e-01, 4.652e-01, -1.320e-01, 1.969e-01, -1.816e-01, -4.724e-01, 1.471e-01, 5.619e-01, 4.145e-02, -3.606e-02, 8.608e-01), r);
	r = MulAdd(s3_8, M4(4.007e-02, -5.339e-02, -2.525e-04, 1.082e-01, -2.520e-01, 2.832e-01, 4.025e-01, 1.532e-01, 1.244e-01, -9.873e-02, -8.482e-02, 5.278e-02, 1.618e-01, -3.404e-02, 2.947e-02, 2.445e-01), r);
	return r;
}

// Pass 2 ("conv1"), second output vector (Pass2 stores it in t3).
// Same structure as this pass's f0 with a different set of constants: a constant
// starting vector, then one MulAdd per input, in argument order, each with its own
// constant 4x4 matrix.
// As called from Pass2: s0_* = max(v, 0) and s1_* = -max(-v, 0) of the nine t0
// taps; s2_* / s3_* are the same split of the nine t1 taps.
// NOTE(review): MulAdd is pulled in by `//!USE MulAdd` and is not defined in this
// file; presumably it computes mul(s, M) + r -- confirm against Magpie.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { -1.499e-01, -1.776e-03, -6.774e-03, 1.322e-02 };
	r = MulAdd(s0_0, M4(-5.307e-02, -6.220e-02, 7.854e-02, 3.023e-02, 1.780e-02, -4.660e-02, 5.384e-02, -3.152e-02, 1.070e-01, -8.460e-02, 1.359e-01, -8.144e-02, -1.251e-01, -7.183e-02, -5.920e-02, -1.442e-01), r);
	r = MulAdd(s0_1, M4(-7.721e-02, 5.857e-02, 1.973e-01, 3.698e-02, -1.199e-01, -2.585e-02, 7.877e-02, -1.865e-01, -3.580e-01, 1.767e-01, -1.279e-02, -4.554e-02, -1.928e-01, 1.017e-01, 2.319e-01, -2.160e-01), r);
	r = MulAdd(s0_2, M4(-9.376e-02, -1.128e-01, -6.333e-02, 3.354e-02, -1.542e-02, 7.209e-02, 2.742e-01, 2.597e-02, 5.620e-02, -6.394e-02, -7.670e-03, 4.574e-02, -7.222e-02, 6.187e-02, -1.593e-01, 3.579e-03), r);
	r = MulAdd(s0_3, M4(5.476e-02, 8.972e-02, -2.571e-01, 2.037e-01, 7.994e-02, -7.436e-06, -1.929e-01, -8.885e-02, 2.996e-01, 3.133e-01, 1.697e-01, 2.396e-02, 2.580e-01, -8.171e-02, -5.660e-01, 1.743e-01), r);
	r = MulAdd(s0_4, M4(-7.670e-01, 4.858e-01, 1.888e-01, -3.512e-01, 1.555e-01, 1.685e-01, 1.297e-01, -1.428e-01, -1.589e-01, 6.807e-02, 2.787e-01, -7.520e-01, -3.137e-02, -2.188e-01, 4.388e-01, 1.228e-01), r);
	r = MulAdd(s0_5, M4(2.404e-01, -3.138e-01, 2.209e-01, 9.275e-02, 5.628e-02, 7.736e-02, -3.531e-01, -7.307e-02, 1.796e-01, -2.184e-01, 2.552e-01, 2.046e-01, -2.306e-01, 1.558e-01, -4.967e-02, -1.155e-01), r);
	r = MulAdd(s0_6, M4(1.131e-01, -2.256e-02, 1.024e-01, 1.899e-01, 1.293e-01, 2.068e-02, -1.174e-01, 1.143e-01, -5.395e-02, 4.318e-02, -9.846e-02, 1.123e-01, 4.766e-02, -2.608e-01, -1.273e-01, -2.761e-01), r);
	r = MulAdd(s0_7, M4(1.882e-01, -5.930e-02, -2.394e-01, 1.931e-01, 1.246e-01, -2.508e-02, -7.158e-02, -2.783e-02, -2.621e-01, 7.212e-01, -1.169e-01, -3.426e-01, -1.276e-01, 6.132e-02, 2.120e-01, -3.544e-01), r);
	r = MulAdd(s0_8, M4(1.605e-02, 4.498e-02, 3.280e-01, -9.840e-02, 1.066e-01, -2.167e-02, 1.566e-01, -6.055e-02, -7.933e-02, 2.419e-01, -6.969e-01, -1.372e-01, 6.544e-02, -4.269e-02, -3.820e-04, 1.499e-01), r);
	r = MulAdd(s1_0, M4(-8.681e-02, -1.087e-01, 9.044e-02, -1.616e-01, 5.700e-02, -1.112e-01, 7.944e-02, -1.029e-01, 2.576e-02, -8.748e-02, 1.241e-01, 7.166e-03, 2.636e-02, 2.446e-02, -4.103e-02, 1.675e-01), r);
	r = MulAdd(s1_1, M4(1.572e-02, 3.793e-02, 2.414e-01, -2.281e-01, -8.712e-02, -6.538e-02, 1.419e-01, -2.081e-01, -9.182e-02, 2.571e-02, 1.355e-01, -1.110e-01, -2.355e-01, 2.339e-01, 1.910e-02, -1.032e-01), r);
	r = MulAdd(s1_2, M4(1.797e-02, -6.616e-02, -1.663e-02, -1.947e-01, -3.057e-02, 5.054e-02, 1.722e-01, -1.862e-02, 7.327e-03, 2.610e-02, -3.037e-01, 1.290e-01, -5.838e-02, 2.218e-02, -5.471e-02, 6.622e-02), r);
	r = MulAdd(s1_3, M4(7.168e-03, 8.656e-02, -1.933e-01, 1.346e-01, 8.983e-04, -7.273e-03, -1.299e-01, 4.448e-02, -1.194e-02, 1.806e-01, 2.176e-01, 9.534e-02, 2.997e-01, -4.100e-02, -6.068e-01, 3.835e-01), r);
	r = MulAdd(s1_4, M4(-2.998e-01, 4.814e-01, 1.137e-01, -9.396e-01, -1.303e-01, 3.726e-02, 2.731e-01, 4.488e-01, -1.218e-01, 1.669e-01, 1.251e-01, -7.614e-02, 2.104e-01, -1.051e-01, 1.398e-01, 3.838e-01), r);
	r = MulAdd(s1_5, M4(4.201e-01, -1.504e-01, 1.123e-01, 1.165e-01, 7.321e-03, 1.057e-01, -5.330e-01, -1.028e-01, 6.962e-02, 1.858e-01, 1.840e-01, -1.049e-01, 4.850e-02, 2.241e-01, -2.765e-01, 2.054e-02), r);
	r = MulAdd(s1_6, M4(1.442e-01, -6.238e-02, 1.505e-01, -7.470e-02, 8.926e-02, -9.376e-03, 2.118e-02, 1.368e-01, 5.482e-02, 1.042e-01, -1.818e-01, 1.364e-01, 8.520e-02, -2.172e-01, -1.501e-01, -1.698e-01), r);
	r = MulAdd(s1_7, M4(2.163e-01, -6.139e-02, -2.190e-01, -1.932e-01, -4.070e-02, -5.123e-02, -9.987e-02, 1.412e-01, -1.614e-01, 6.292e-01, -1.916e-01, -8.262e-02, -2.486e-01, 8.627e-03, 2.816e-01, -2.516e-02), r);
	r = MulAdd(s1_8, M4(-4.631e-02, -2.598e-02, -8.044e-02, -1.965e-01, 9.235e-02, -2.282e-02, -9.376e-02, 1.773e-02, -9.385e-02, 2.837e-01, -5.315e-01, 1.079e-01, 3.615e-02, -3.025e-02, -8.471e-02, 1.044e-01), r);
	r = MulAdd(s2_0, M4(1.066e-01, 7.468e-02, 1.383e-01, -2.494e-01, -5.715e-02, 1.462e-01, -4.641e-02, -6.538e-02, -3.077e-02, 6.170e-02, 2.492e-01, 1.307e-01, 1.527e-01, 4.648e-02, -1.311e-01, 1.123e-02), r);
	r = MulAdd(s2_1, M4(-2.184e-02, -8.723e-02, -6.092e-02, 3.431e-03, 9.186e-02, 5.499e-03, 2.798e-01, -1.613e-02, 2.447e-01, 2.809e-02, -3.777e-02, 4.224e-02, 9.632e-02, -3.813e-02, -6.021e-02, -5.802e-02), r);
	r = MulAdd(s2_2, M4(7.639e-02, 1.966e-02, 2.578e-01, 1.208e-01, 2.502e-02, -3.256e-02, -1.851e-01, -2.343e-02, -2.017e-01, -5.456e-02, -1.051e-01, -1.624e-01, 2.899e-02, -1.224e-03, 3.144e-01, 1.033e-01), r);
	r = MulAdd(s2_3, M4(-1.768e-01, 1.433e-01, 1.390e-01, -1.494e-01, -3.681e-02, -1.480e-01, -3.328e-01, 4.138e-02, -1.058e-01, 2.190e-01, 3.278e-01, -2.495e-01, 8.836e-02, 1.228e-01, -3.728e-01, 3.534e-01), r);
	r = MulAdd(s2_4, M4(5.058e-02, -9.077e-02, -7.532e-02, 9.344e-02, 1.096e-01, 5.318e-02, -2.979e-01, 1.779e-01, -1.682e-01, 3.371e-01, 1.964e-01, -2.526e-01, -3.013e-01, 5.184e-02, 3.613e-01, 5.832e-02), r);
	r = MulAdd(s2_5, M4(-4.613e-02, 3.121e-02, 5.499e-02, 7.645e-02, -1.745e-01, -4.111e-02, 3.210e-01, -2.519e-02, -8.660e-02, 4.306e-01, 5.642e-01, -1.645e-01, -6.127e-02, 3.565e-02, 3.529e-02, 5.166e-02), r);
	r = MulAdd(s2_6, M4(-1.482e-01, 3.199e-02, -1.302e-02, 7.168e-02, 2.378e-01, -7.776e-02, 2.090e-01, 1.870e-01, 1.820e-01, 8.673e-02, -9.481e-05, 2.911e-01, 6.907e-02, -4.005e-02, -1.265e-01, -1.030e-01), r);
	r = MulAdd(s2_7, M4(1.354e-01, 2.906e-03, 6.018e-02, 8.451e-02, -1.123e-02, -1.011e-01, 7.115e-02, -8.680e-02, 2.088e-01, 2.977e-01, 8.741e-02, 2.908e-05, -2.357e-01, 3.996e-02, -1.434e-02, -1.192e-01), r);
	r = MulAdd(s2_8, M4(2.121e-03, 2.972e-02, -5.123e-02, -5.465e-03, -1.124e-01, 7.063e-02, 1.180e-02, -1.732e-01, -1.739e-01, 8.947e-02, -8.452e-02, 4.847e-01, -5.393e-02, -4.491e-02, -3.357e-02, -5.699e-03), r);
	r = MulAdd(s3_0, M4(4.013e-01, 1.153e-01, 5.353e-03, -2.882e-01, -5.749e-02, 5.224e-02, -2.923e-02, -3.499e-02, 2.801e-02, -1.767e-02, 9.020e-02, -8.928e-02, 9.319e-02, -3.256e-02, -4.600e-02, -2.629e-01), r);
	r = MulAdd(s3_1, M4(1.098e-01, -1.177e-03, -2.211e-01, 1.097e-02, 3.512e-02, 1.884e-02, 2.745e-01, -1.055e-01, 3.093e-02, 7.905e-02, 5.369e-02, 1.962e-01, -8.069e-03, -1.057e-01, -5.889e-02, 6.130e-02), r);
	r = MulAdd(s3_2, M4(1.107e-01, -2.817e-02, 8.050e-02, 6.254e-02, -1.035e-02, -7.545e-02, -1.390e-01, -8.220e-02, -7.958e-02, -1.110e-01, -2.425e-01, -8.612e-02, 8.152e-03, -1.041e-01, 2.705e-01, -2.947e-03), r);
	r = MulAdd(s3_3, M4(-1.766e-01, 2.151e-01, 5.500e-01, 1.463e-01, -3.390e-01, -9.116e-02, -1.704e-01, 9.677e-02, -1.434e-01, 2.198e-01, 4.151e-01, -1.036e-01, 2.908e-01, 1.355e-01, -3.566e-01, -1.082e-01), r);
	r = MulAdd(s3_4, M4(1.299e-01, -6.859e-02, -4.033e-01, 2.197e-01, -1.170e-01, 9.324e-03, -2.593e-01, 1.434e-01, -1.492e-01, 4.124e-01, 3.600e-01, -1.676e-01, 2.548e-02, -1.581e-02, 1.796e-01, -3.075e-01), r);
	r = MulAdd(s3_5, M4(-5.624e-02, 1.779e-02, 5.036e-02, -1.849e-01, -1.064e-01, -1.738e-02, 3.161e-01, 5.391e-02, -5.693e-02, 3.279e-01, -6.516e-02, -3.823e-02, -1.512e-01, 9.958e-03, -4.383e-02, -1.996e-01), r);
	r = MulAdd(s3_6, M4(-1.586e-01, -2.236e-03, -8.471e-02, -1.331e-01, -3.366e-02, -3.968e-01, -2.313e-01, -5.757e-01, 1.606e-01, 3.686e-02, 1.387e-02, 9.619e-02, -1.653e-01, -1.318e-01, -1.257e-01, -1.748e-01), r);
	r = MulAdd(s3_7, M4(1.938e-01, 1.519e-02, 2.123e-02, 1.121e-01, 5.456e-02, -1.568e-01, 2.224e-01, -5.000e-01, -6.363e-02, 2.205e-01, -3.627e-02, 1.547e-01, 8.136e-03, 2.279e-02, 4.444e-01, -5.358e-02), r);
	r = MulAdd(s3_8, M4(-5.716e-02, -5.222e-03, -1.156e-02, -6.042e-02, -1.024e-01, 7.483e-02, -2.342e-02, 1.065e-01, -5.690e-02, 6.041e-02, -1.489e-01, -5.921e-02, -1.887e-02, -2.115e-01, 6.895e-02, -7.626e-02), r);
	return r;
}

// Pass 2 ("conv1") entry point.
// Maps the thread to a texel via Rmp8x8(tid.x) + blockStart, reads the 3x3
// neighbourhoods of t0 and t1 around it, splits every tap v into max(v, 0) and
// -max(-v, 0), and writes f0 / f1 of the resulting 36 vectors to t2 / t3.
// Threads whose texel falls outside GetInputSize() write nothing.
void Pass2(uint2 blockStart, uint3 tid) {
	uint2 dim = GetInputSize();
	uint2 px = blockStart + Rmp8x8(tid.x);
	if (!(px.x < dim.x && px.y < dim.y)) {
		return;
	}

	// `pt` and `pos` are read by the O() macro inside l0() / l1(); the names must stay.
	float2 pt = float2(GetInputPt());
	float2 pos = (px + 0.5) * pt;

	// Raw t0 taps, row by row: y = -1, 0, 1 and, within a row, x = -1, 0, 1.
	V4 a0 = l0(-1.0, -1.0);
	V4 a1 = l0(0.0, -1.0);
	V4 a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0);
	V4 a4 = l0(0.0, 0.0);
	V4 a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0);
	V4 a7 = l0(0.0, 1.0);
	V4 a8 = l0(1.0, 1.0);

	// Raw t1 taps, same order.
	V4 b0 = l1(-1.0, -1.0);
	V4 b1 = l1(0.0, -1.0);
	V4 b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0);
	V4 b4 = l1(0.0, 0.0);
	V4 b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0);
	V4 b7 = l1(0.0, 1.0);
	V4 b8 = l1(1.0, 1.0);

	// t0 taps clamped from below at zero.
	V4 pa0 = max(a0, 0.0);
	V4 pa1 = max(a1, 0.0);
	V4 pa2 = max(a2, 0.0);
	V4 pa3 = max(a3, 0.0);
	V4 pa4 = max(a4, 0.0);
	V4 pa5 = max(a5, 0.0);
	V4 pa6 = max(a6, 0.0);
	V4 pa7 = max(a7, 0.0);
	V4 pa8 = max(a8, 0.0);

	// t0 taps with the complementary clamp, -max(-v, 0).
	V4 na0 = -max(-a0, 0.0);
	V4 na1 = -max(-a1, 0.0);
	V4 na2 = -max(-a2, 0.0);
	V4 na3 = -max(-a3, 0.0);
	V4 na4 = -max(-a4, 0.0);
	V4 na5 = -max(-a5, 0.0);
	V4 na6 = -max(-a6, 0.0);
	V4 na7 = -max(-a7, 0.0);
	V4 na8 = -max(-a8, 0.0);

	// t1 taps clamped from below at zero.
	V4 pb0 = max(b0, 0.0);
	V4 pb1 = max(b1, 0.0);
	V4 pb2 = max(b2, 0.0);
	V4 pb3 = max(b3, 0.0);
	V4 pb4 = max(b4, 0.0);
	V4 pb5 = max(b5, 0.0);
	V4 pb6 = max(b6, 0.0);
	V4 pb7 = max(b7, 0.0);
	V4 pb8 = max(b8, 0.0);

	// t1 taps with the complementary clamp, -max(-v, 0).
	V4 nb0 = -max(-b0, 0.0);
	V4 nb1 = -max(-b1, 0.0);
	V4 nb2 = -max(-b2, 0.0);
	V4 nb3 = -max(-b3, 0.0);
	V4 nb4 = -max(-b4, 0.0);
	V4 nb5 = -max(-b5, 0.0);
	V4 nb6 = -max(-b6, 0.0);
	V4 nb7 = -max(-b7, 0.0);
	V4 nb8 = -max(-b8, 0.0);

	// Argument groups, in order: t0 max(v, 0), t0 -max(-v, 0), t1 max(v, 0), t1 -max(-v, 0).
	t2[px] = f0(pa0, pa1, pa2, pa3, pa4, pa5, pa6, pa7, pa8, na0, na1, na2, na3, na4, na5, na6, na7, na8, pb0, pb1, pb2, pb3, pb4, pb5, pb6, pb7, pb8, nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8);
	t3[px] = f1(pa0, pa1, pa2, pa3, pa4, pa5, pa6, pa7, pa8, na0, na1, na2, na3, na4, na5, na6, na7, na8, pb0, pb1, pb2, pb3, pb4, pb5, pb6, pb7, pb8, nb0, nb1, nb2, nb3, nb4, nb5, nb6, nb7, nb8);
}

//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1

// Pass 3 taps: l0 / l1 sample t2 / t3 at offset (x, y) (scaled by pt inside O())
// and convert the result to V4.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))

// Pass 3 ("conv2"), first of the two weight functions defined for this pass.
// Starts from a constant 4-component vector and folds in each of the 36 inputs,
// in argument order, through MulAdd with that input's own constant 4x4 matrix.
// NOTE(review): the caller lies outside the visible portion of the file;
// presumably Pass3 feeds it the t2 / t3 taps split the same way Pass2 splits
// t0 / t1, and stores the result in t0 -- confirm.
// NOTE(review): MulAdd is pulled in by `//!USE MulAdd` and is not defined in this
// file; presumably it computes mul(s, M) + r -- confirm against Magpie.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { -2.216e-03, -1.314e-01, -1.773e-02, 2.580e-02 };
	r = MulAdd(s0_0, M4(-1.716e-02, -1.345e-01, 1.032e-01, 4.104e-02, 8.711e-02, 8.589e-02, 8.094e-02, -4.856e-02, 1.430e-01, 7.623e-02, -6.074e-02, 9.124e-02, 3.848e-03, -2.421e-02, -3.542e-02, 2.262e-02), r);
	r = MulAdd(s0_1, M4(5.328e-02, -4.876e-02, 2.747e-01, -3.134e-01, 5.165e-02, -8.168e-02, -9.645e-02, 9.013e-02, 5.684e-01, 3.945e-01, -5.757e-02, 3.315e-01, 2.290e-01, -2.012e-01, -1.305e-01, 1.678e-01), r);
	r = MulAdd(s0_2, M4(-5.859e-02, -1.364e-01, 1.517e-02, -2.630e-01, 2.100e-02, 1.196e-02, -6.509e-03, 3.381e-02, 1.752e-01, 1.950e-01, 3.989e-02, -3.574e-03, -1.470e-01, -2.938e-01, 9.845e-02, 8.800e-02), r);
	r = MulAdd(s0_3, M4(-3.231e-02, -1.792e-01, 4.669e-01, 2.254e-01, -5.143e-03, -1.238e-01, -1.704e-01, -2.428e-01, -1.350e-01, -1.620e-01, -2.490e-01, 5.236e-02, -9.524e-02, -5.944e-02, -1.270e-01, 2.059e-01), r);
	r = MulAdd(s0_4, M4(-6.691e-02, 8.980e-03, 1.013e-02, 1.386e-01, -8.353e-02, -1.772e-01, -6.338e-02, -3.381e-01, -6.478e-02, 2.335e-01, 6.344e-02, 2.214e-01, 1.850e-01, 2.665e-02, -1.302e-01, -3.592e-01), r);
	r = MulAdd(s0_5, M4(-6.946e-02, -3.227e-03, 1.130e-01, -3.646e-01, 6.317e-02, -2.508e-01, 1.449e-02, 1.909e-01, 4.092e-02, -1.928e-01, 4.181e-02, 1.998e-01, -1.042e-01, -1.643e-01, 1.123e-01, -1.423e-01), r);
	r = MulAdd(s0_6, M4(4.571e-04, -2.226e-01, 1.542e-01, -1.980e-01, 2.272e-01, 6.605e-02, -1.823e-01, -3.202e-01, -1.017e-02, 4.403e-02, -1.199e-01, -6.896e-02, -2.050e-02, -4.035e-03, -2.686e-02, 6.758e-02), r);
	r = MulAdd(s0_7, M4(-2.184e-02, 2.620e-01, -1.598e-01, -1.310e-01, -1.282e-02, -6.247e-02, -5.500e-02, -1.549e-01, -1.201e-02, 2.664e-02, -1.159e-01, 2.574e-02, -4.379e-02, 4.502e-02, -1.428e-01, 2.220e-01), r);
	r = MulAdd(s0_8, M4(8.156e-03, -1.909e-03, 1.260e-01, -8.108e-02, 8.324e-02, -1.054e-01, 3.860e-02, -5.290e-02, -1.608e-03, -3.940e-02, -1.942e-02, -8.427e-02, -5.776e-02, -1.097e-01, -1.933e-02, 1.763e-01), r);
	r = MulAdd(s1_0, M4(-2.796e-02, -6.590e-02, -1.873e-01, 2.170e-01, -8.312e-02, -2.841e-02, 1.736e-01, -1.202e-01, 6.437e-02, 1.920e-03, 1.832e-01, -2.252e-01, 1.106e-01, 1.346e-01, 8.420e-03, -2.089e-01), r);
	r = MulAdd(s1_1, M4(7.340e-02, 2.419e-02, 8.738e-03, 1.919e-01, -6.706e-02, -1.429e-02, -9.722e-02, -1.269e-01, -4.527e-02, -1.184e-01, -3.381e-03, -1.308e-01, 1.871e-01, 2.487e-01, -2.376e-02, -1.436e-01), r);
	r = MulAdd(s1_2, M4(1.461e-01, 1.143e-01, -1.074e-01, 8.386e-02, -1.447e-01, -1.686e-01, 9.007e-02, -1.183e-01, 5.843e-02, -7.875e-02, -6.239e-02, 5.076e-02, -8.418e-02, -5.289e-02, 7.695e-02, -4.506e-02), r);
	r = MulAdd(s1_3, M4(-1.277e-01, -9.044e-02, -2.494e-01, 2.466e-01, -3.309e-02, -1.805e-01, 2.197e-01, 9.758e-02, 2.339e-02, -2.082e-01, 2.091e-01, -3.370e-01, 1.096e-01, 1.691e-01, -1.687e-01, 1.317e-03), r);
	r = MulAdd(s1_4, M4(7.503e-02, -9.570e-02, -3.911e-01, 3.020e-01, -3.351e-01, 3.397e-01, -1.038e-01, 4.915e-01, -2.002e-01, 2.340e-01, 1.700e-01, 1.536e-01, 2.331e-01, 3.055e-01, -7.175e-02, -2.936e-01), r);
	r = MulAdd(s1_5, M4(1.693e-01, 6.752e-02, -6.320e-02, 1.731e-01, 5.053e-02, 1.589e-01, -3.209e-02, 2.180e-02, -2.516e-02, -3.550e-01, -6.057e-02, 1.766e-01, 6.070e-02, 2.377e-01, 1.420e-01, -1.750e-01), r);
	r = MulAdd(s1_6, M4(5.968e-02, 1.878e-01, -7.204e-02, -1.839e-01, 9.632e-02, 4.321e-02, 1.055e-02, -5.708e-02, -5.873e-02, -3.095e-02, -2.059e-02, 1.277e-02, -1.520e-02, 1.516e-01, -2.994e-01, -7.694e-02), r);
	r = MulAdd(s1_7, M4(-5.850e-02, 3.689e-02, -2.758e-01, -5.820e-02, -7.878e-02, 1.943e-01, 1.318e-01, -6.132e-03, 3.420e-02, 1.539e-01, -1.353e-01, 1.066e-01, -1.837e-02, 1.026e-01, -2.007e-01, 1.803e-02), r);
	r = MulAdd(s1_8, M4(3.459e-02, -7.917e-02, -6.715e-02, -6.375e-02, 5.512e-02, -1.532e-01, -1.008e-02, 3.281e-02, 2.993e-02, 6.528e-02, 1.202e-02, -1.341e-01, 1.046e-01, 2.568e-01, -3.195e-02, -6.128e-03), r);
	r = MulAdd(s2_0, M4(-1.676e-01, 6.091e-02, 1.021e-01, 1.900e-01, 6.284e-01, 3.015e-01, 1.076e-01, -4.315e-01, 9.855e-02, 7.527e-02, 1.173e-01, 1.454e-01, -1.936e-02, -7.795e-02, 4.561e-02, -1.009e-01), r);
	r = MulAdd(s2_1, M4(-1.538e-01, 1.924e-01, -2.160e-01, 1.121e-02, 3.876e-01, 6.074e-01, 1.912e-01, 1.107e+00, -1.019e-01, 2.046e-02, 4.697e-01, 2.091e-02, -1.841e-01, -4.174e-02, 2.999e-01, 8.326e-02), r);
	r = MulAdd(s2_2, M4(-9.220e-02, 1.825e-01, -1.562e-01, 3.358e-01, 5.226e-02, 6.678e-01, 3.058e-01, 2.938e-01, 1.706e-01, 8.658e-02, -6.464e-02, -3.511e-01, 4.983e-02, 1.655e-01, 6.151e-02, -2.758e-01), r);
	r = MulAdd(s2_3, M4(-1.193e-01, -8.883e-03, -6.676e-02, -1.863e-01, 5.507e-01, 2.139e-01, -2.313e-01, -3.117e-01, 7.110e-03, -2.253e-02, 2.619e-02, -1.450e-01, 2.895e-02, 1.838e-02, 1.456e-02, -1.848e-02), r);
	r = MulAdd(s2_4, M4(8.266e-01, 1.957e-01, -5.592e-01, -1.102e+00, -1.108e+00, -2.799e-01, 2.460e+00, -2.017e+00, -1.304e-01, 5.934e-02, 2.235e-01, -1.531e-01, 2.761e-01, -2.611e-02, -2.482e-01, -2.947e-02), r);
	r = MulAdd(s2_5, M4(1.606e-01, 1.278e-02, -9.921e-02, 9.180e-02, 1.138e-01, -3.889e-02, 1.627e-01, 2.813e+00, 5.255e-02, -2.075e-02, -1.651e-01, -1.358e-01, -1.813e-01, 3.018e-01, 3.389e-04, -3.242e-01), r);
	r = MulAdd(s2_6, M4(-1.029e-02, 1.966e-01, 9.284e-03, 3.095e-01, -1.677e-01, -5.509e-02, -1.289e+00, -3.562e-02, 2.497e-02, 1.040e-01, -1.576e-01, -1.242e-01, -6.614e-02, -2.606e-02, -2.151e-02, 9.371e-02), r);
	r = MulAdd(s2_7, M4(2.008e-01, -2.042e-01, 3.626e-03, -8.524e-03, -3.760e-01, 3.521e-01, -4.482e-01, -7.956e-01, -1.076e-01, -2.298e-02, 1.228e-01, 2.675e-01, 8.843e-02, 2.774e-01, -1.114e-01, 6.773e-02), r);
	r = MulAdd(s2_8, M4(7.028e-03, 2.032e-01, -2.760e-01, -5.645e-02, 7.888e-02, 4.420e-01, 1.608e-01, 7.254e-02, 8.015e-02, -1.828e-01, -1.142e-02, -2.063e-02, -1.091e-01, -3.137e-02, 2.670e-02, -1.370e-03), r);
	r = MulAdd(s3_0, M4(3.985e-02, -4.648e-04, -6.229e-02, 1.024e-02, 2.510e-02, -1.833e-01, -2.537e-02, -5.906e-02, 1.551e-01, 1.097e+00, -9.003e-01, 9.391e-01, -1.978e-02, -5.407e-02, 2.358e-01, 1.039e-01), r);
	r = MulAdd(s3_1, M4(-1.454e-02, 6.619e-02, -5.213e-03, -5.554e-02, 4.323e-02, 1.004e-01, 1.153e-01, 1.370e-01, 4.502e-01, 3.804e-01, 2.568e-01, 7.781e-01, -3.095e-02, 3.557e-02, 1.035e-01, -2.194e-02), r);
	r = MulAdd(s3_2, M4(-3.902e-02, -1.607e-01, 5.826e-02, 1.034e-02, -2.753e-03, 1.175e-01, 3.703e-02, 1.375e-02, 7.297e-02, 2.820e-01, 5.462e-04, -5.188e-02, 2.212e-01, 1.518e-01, -1.325e-01, 9.074e-02), r);
	r = MulAdd(s3_3, M4(-5.867e-02, -1.032e-01, -2.187e-02, 2.861e-02, 6.211e-02, 8.308e-02, -1.475e-01, 3.146e-02, -1.838e-01, -1.555e-01, -8.405e-01, -3.969e-01, -2.232e-02, -1.254e-01, 1.786e-01, -2.215e-01), r);
	r = MulAdd(s3_4, M4(1.142e-01, -2.823e-01, -3.135e-01, 2.494e-01, -2.510e-01, 1.359e-01, 2.920e-01, 3.156e-03, -1.326e-01, -4.658e-01, 2.085e-01, -2.149e-01, 3.455e-01, 5.045e-01, -3.612e-01, 2.250e-01), r);
	r = MulAdd(s3_5, M4(9.107e-03, -2.388e-01, 8.027e-02, -1.974e-01, 2.763e-01, -7.958e-02, 1.082e-01, 1.907e-01, 1.002e-01, -7.503e-02, 1.903e-02, 1.209e-01, -3.394e-01, -5.029e-03, -2.791e-01, -3.699e-02), r);
	r = MulAdd(s3_6, M4(-1.739e-02, 6.326e-02, 8.739e-02, 6.420e-02, -3.866e-02, -9.341e-02, -2.571e-01, -1.428e-01, -4.662e-02, 8.787e-02, -2.799e-01, -2.134e-01, 1.020e-02, -2.975e-02, 7.063e-02, -1.584e-01), r);
	r = MulAdd(s3_7, M4(1.056e-01, -1.006e-01, 1.313e-01, 1.137e-02, -9.836e-02, -4.874e-02, -2.130e-01, -1.634e-01, -5.612e-02, -1.760e-01, -1.908e-01, 1.199e-01, -1.486e-02, 1.704e-01, -7.049e-03, -1.845e-01), r);
	r = MulAdd(s3_8, M4(-1.343e-01, -2.351e-02, -1.138e-01, 1.969e-02, -2.157e-02, -2.178e-01, 6.627e-02, 2.071e-02, -8.215e-02, -2.077e-01, 6.798e-02, 3.783e-02, -4.294e-02, 2.298e-01, 3.238e-01, -3.153e-01), r);
	return r;
}

// Pass 3, second output group: produces the 4-channel value that Pass3 stores
// into t1.
//
// Arguments are 3x3 neighbourhood taps; the trailing index 0..8 runs row-major
// from offset (-1,-1) to (1,1) (see the tap loads in Pass3):
//   s0_k = max(l0 tap, 0)      s1_k = -max(-(l0 tap), 0)
//   s2_k = max(l1 tap, 0)      s3_k = -max(-(l1 tap), 0)
//
// Starting from a constant 4-vector, every tap is folded into the accumulator
// `r` with MulAdd(tap, 4x4 constant matrix, r), in the fixed order listed below.
// NOTE(review): MulAdd is pulled in by the `//!USE MulAdd` directive; presumably
// it evaluates r + mul(tap, matrix) — confirm against Magpie's definition.
// NOTE(review): the literals look like generated (trained) weights; they should
// not be edited or reordered by hand.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator (presumably the bias term).
	V4 r = { 5.906e-02, -2.826e-02, 3.865e-02, 1.167e-02 };
	// s0_*: positive parts of the l0 taps.
	r = MulAdd(s0_0, M4(1.193e-01, -1.140e-01, -1.286e-01, 1.857e-01, 1.764e-02, 2.680e-02, -2.096e-02, -2.392e-02, 1.995e-02, -1.275e-01, -1.179e-02, -9.041e-02, 2.080e-02, -2.054e-02, -6.917e-02, -3.731e-02), r);
	r = MulAdd(s0_1, M4(5.910e-02, -3.300e-02, 2.187e-01, -4.265e-02, -1.050e-02, -3.859e-02, 7.390e-02, 1.629e-02, 4.529e-02, -1.788e-01, 4.091e-01, -6.928e-02, 1.657e-01, -4.254e-03, -3.151e-01, 2.919e-01), r);
	r = MulAdd(s0_2, M4(-4.341e-02, 2.223e-02, -1.008e-01, 8.143e-02, -9.846e-03, 5.909e-02, -2.690e-02, 3.039e-02, 1.086e-01, -6.404e-02, -6.064e-02, -1.898e-01, 9.368e-02, 3.736e-03, -3.105e-02, 6.931e-02), r);
	r = MulAdd(s0_3, M4(-3.157e-02, -1.135e-02, -4.165e-01, -1.928e-01, -7.721e-02, 1.880e-01, -1.479e-01, 1.343e-01, 1.394e-01, -1.888e-01, 4.132e-02, 3.282e-02, 1.008e-01, 3.034e-02, -1.672e-01, 2.711e-01), r);
	r = MulAdd(s0_4, M4(5.348e-02, -7.725e-02, 1.096e-01, 1.428e-01, 2.230e-02, 4.346e-01, -1.861e-01, 2.064e-01, 9.964e-02, -1.543e-01, 2.040e-01, -2.667e-01, 2.274e-01, -4.279e-02, 2.191e-01, 1.740e-01), r);
	r = MulAdd(s0_5, M4(-4.585e-02, 7.224e-02, 2.702e-02, -5.440e-02, -8.830e-02, 1.219e-01, -5.870e-02, -1.575e-02, -2.895e-01, 1.297e-01, -2.219e-01, -3.090e-01, 1.459e-02, 6.562e-02, 8.912e-02, 3.619e-02), r);
	r = MulAdd(s0_6, M4(5.821e-02, -1.235e-01, 2.805e-01, 1.661e-02, -1.411e-01, 4.756e-01, 1.758e-02, 2.091e-01, -7.398e-02, -1.141e-02, 9.094e-03, 5.836e-02, -5.062e-02, -1.953e-02, 9.519e-03, 1.514e-01), r);
	r = MulAdd(s0_7, M4(-2.608e-02, 5.385e-02, 2.445e-01, -2.328e-02, -1.261e-02, 3.237e-01, -1.273e-01, -9.910e-02, 6.702e-03, -9.507e-03, 1.413e-01, -8.367e-02, 6.338e-02, 7.485e-02, 1.797e-01, -7.493e-03), r);
	r = MulAdd(s0_8, M4(-1.616e-01, 9.600e-02, 5.955e-02, -1.206e-01, -1.295e-02, -7.217e-03, -9.834e-02, 1.012e-02, -9.242e-02, -8.508e-03, -8.789e-02, -1.357e-01, 3.131e-02, 1.343e-02, -8.285e-02, 1.658e-02), r);
	// s1_*: negative parts of the l0 taps.
	r = MulAdd(s1_0, M4(-1.426e-02, 1.560e-02, -2.351e-01, 1.411e-02, 6.414e-02, -1.062e-02, -6.187e-02, 3.851e-02, -1.255e-01, -2.124e-01, 5.334e-02, -1.702e-01, -1.592e-02, 8.683e-02, 3.298e-02, -1.396e-01), r);
	r = MulAdd(s1_1, M4(2.771e-02, 1.236e-01, 3.715e-01, 3.765e-03, 1.343e-01, -8.645e-02, -2.129e-01, 2.529e-01, -3.053e-01, 6.211e-02, 6.285e-01, 6.935e-02, -1.150e-01, -1.477e-01, 1.362e-01, -7.007e-02), r);
	r = MulAdd(s1_2, M4(-1.013e-01, 2.877e-02, -1.274e-01, 6.261e-02, 4.531e-02, -6.713e-02, -1.651e-02, -9.269e-02, -1.616e-01, -3.828e-03, -3.715e-01, 1.978e-03, 2.994e-02, 5.941e-02, -1.039e-01, -9.121e-02), r);
	r = MulAdd(s1_3, M4(1.614e-01, 2.806e-01, -5.564e-01, 1.712e-01, 3.865e-02, -9.885e-02, 1.400e-01, -1.006e-01, 3.954e-02, -7.045e-02, 2.290e-01, 5.378e-03, -4.572e-02, 5.276e-02, 3.839e-02, 7.448e-02), r);
	r = MulAdd(s1_4, M4(-2.672e-02, 2.006e-01, 2.841e-01, -1.999e-02, 1.419e-02, 8.672e-02, 2.153e-01, 2.087e-01, 1.046e-02, 9.963e-01, 6.317e-01, 7.537e-01, -2.202e-01, -7.516e-02, 3.076e-01, -1.556e-01), r);
	r = MulAdd(s1_5, M4(-3.143e-01, 5.866e-02, -1.605e-01, -1.615e-01, -8.577e-02, 7.590e-03, 2.199e-01, 9.684e-03, -2.125e-02, -2.988e-02, -4.535e-01, 5.538e-02, -2.528e-01, 1.243e-01, 1.700e-01, -1.200e-01), r);
	r = MulAdd(s1_6, M4(-1.574e-01, 6.763e-02, 4.039e-02, 1.469e-01, 2.257e-03, 1.424e-01, -1.655e-01, 8.329e-02, -7.065e-02, -2.476e-01, -9.983e-02, 1.232e-01, -7.377e-02, 1.521e-02, -3.496e-02, -1.429e-02), r);
	r = MulAdd(s1_7, M4(-3.125e-04, -2.618e-03, -6.887e-02, -2.244e-01, -4.241e-02, -1.978e-01, 1.768e-01, -1.142e-01, 3.186e-02, -8.708e-02, 1.963e-01, 1.347e-01, -9.015e-02, -1.290e-01, 1.373e-01, -1.238e-01), r);
	r = MulAdd(s1_8, M4(-2.192e-01, 5.605e-04, -1.505e-01, -2.801e-01, 1.144e-01, -1.020e-01, -5.691e-02, 1.329e-01, -4.557e-02, -5.067e-02, -5.847e-03, 1.539e-02, -2.222e-01, -1.948e-02, -7.628e-02, -1.468e-01), r);
	// s2_*: positive parts of the l1 taps.
	r = MulAdd(s2_0, M4(1.248e-02, -6.067e-02, 6.221e-03, -2.587e-01, 1.340e-01, 9.183e-02, 1.780e-01, 1.129e-01, -5.239e-03, 7.200e-03, -8.953e-03, 1.245e-01, -5.878e-02, -5.848e-02, 4.972e-02, -1.553e-02), r);
	r = MulAdd(s2_1, M4(5.369e-02, -1.389e-01, 2.474e-01, -9.388e-02, 1.382e-01, -2.378e-01, 1.034e+00, -4.127e-01, -5.765e-02, -2.028e-01, -5.532e-02, -6.872e-02, -1.059e-02, 7.206e-03, -2.455e-01, -4.938e-02), r);
	r = MulAdd(s2_2, M4(1.712e-01, -7.478e-02, 2.833e-01, -1.819e-01, 3.654e-01, -1.324e-01, 2.584e-01, -3.937e-01, 1.282e-01, -7.266e-02, 7.739e-02, 1.304e-01, -4.748e-02, -1.578e-01, 4.012e-02, -3.365e-02), r);
	r = MulAdd(s2_3, M4(-1.748e-01, -1.185e-01, 3.252e-01, 8.781e-02, 3.503e-01, -6.309e-01, 1.597e+00, 9.112e-02, -3.477e-01, -7.516e-02, -5.034e-02, -2.982e-01, 1.566e-01, -7.671e-03, -5.705e-02, 4.100e-02), r);
	r = MulAdd(s2_4, M4(-7.281e-01, 7.474e-01, -7.460e-01, -5.525e-01, 6.676e-01, -2.384e+00, -2.085e+00, 2.139e+00, 5.178e-02, -2.687e-01, 4.824e-02, 4.211e-02, -4.863e-03, -2.555e-01, 1.548e-01, -5.389e-02), r);
	r = MulAdd(s2_5, M4(2.294e-01, 4.563e-02, -1.082e-01, 1.918e-01, -4.791e-01, -3.667e-01, -9.200e-01, -7.338e-02, 2.055e-01, 1.443e-01, -1.102e-01, 3.247e-01, -1.077e-01, 1.510e-01, 9.772e-02, -9.652e-02), r);
	r = MulAdd(s2_6, M4(-7.180e-02, -6.766e-02, 4.233e-02, 2.346e-01, 2.416e-01, -5.872e-01, -3.803e-01, 1.459e-01, 1.314e-01, 1.762e-01, 2.180e-01, -6.325e-02, -2.639e-02, -6.961e-02, -6.823e-02, 1.208e-01), r);
	r = MulAdd(s2_7, M4(-1.771e-02, 8.693e-01, 6.895e-02, 1.746e-01, 7.339e-01, -2.318e+00, -9.599e-02, 6.005e-03, -2.361e-01, 4.397e-03, 4.241e-02, -3.560e-01, -2.276e-01, 2.720e-01, -2.148e-03, 1.158e-01), r);
	r = MulAdd(s2_8, M4(1.415e-02, 4.584e-02, -3.553e-01, 5.112e-02, -6.470e-01, -1.005e+00, -4.282e-02, -9.747e-01, -4.664e-02, 1.182e-01, -1.638e-01, 3.781e-02, 9.961e-02, 6.290e-02, 1.452e-01, -2.967e-02), r);
	// s3_*: negative parts of the l1 taps.
	r = MulAdd(s3_0, M4(-1.042e-02, 1.059e-02, 8.109e-02, -1.976e-02, 5.573e-02, 1.743e-01, 2.189e-03, 1.850e-01, -2.536e-01, 2.196e-01, 9.472e-01, -4.919e-01, 1.358e-02, -1.083e-02, -4.265e-01, -5.187e-02), r);
	r = MulAdd(s3_1, M4(8.612e-02, -1.189e-01, -2.101e-01, 1.060e-01, 1.314e-01, -1.630e-02, -5.186e-02, 4.664e-02, -6.413e-01, -3.610e-01, 4.229e-01, -4.334e-01, 4.259e-02, -1.822e-02, 4.311e-01, -2.530e-01), r);
	r = MulAdd(s3_2, M4(2.664e-02, 2.255e-02, 3.200e-02, -1.377e-01, -1.708e-01, -8.546e-02, 8.941e-02, -8.314e-02, -2.572e-01, -4.522e-03, 2.598e-02, -3.987e-02, -7.309e-02, -7.170e-02, 3.738e-01, -1.056e-01), r);
	r = MulAdd(s3_3, M4(1.327e-03, -1.352e-01, -1.486e-01, -4.253e-03, 9.866e-02, 2.134e-01, 2.813e-01, 1.472e-01, 3.699e-01, -6.387e-01, 4.480e-01, -5.420e-01, 6.457e-02, -1.278e-01, 1.867e-01, -2.355e-01), r);
	r = MulAdd(s3_4, M4(-1.330e-01, -1.691e-01, 3.231e-01, -2.959e-01, 2.198e-01, -1.739e-02, -6.569e-02, 3.037e-01, -2.062e-01, 9.855e-02, -5.762e-01, -1.501e-02, 4.521e-02, 2.517e-01, -3.916e-01, -3.841e-01), r);
	r = MulAdd(s3_5, M4(2.251e-01, 2.664e-02, 3.331e-02, 2.836e-01, -2.527e-01, 4.076e-02, -1.331e-02, -3.068e-03, -4.003e-04, -3.312e-02, -4.418e-02, -7.245e-02, 1.807e-01, -5.017e-01, 6.191e-02, -6.283e-01), r);
	r = MulAdd(s3_6, M4(-5.410e-02, 7.319e-02, -4.022e-02, -5.598e-02, 1.675e-01, -1.364e-02, 3.302e-02, 2.473e-03, 1.630e-01, -1.955e-02, 2.311e-02, -1.402e-01, -1.078e-02, -1.675e-01, 1.211e-01, -3.069e-03), r);
	r = MulAdd(s3_7, M4(-3.596e-02, -1.378e-02, 1.912e-01, 1.530e-02, 1.993e-01, -2.688e-01, -6.555e-02, -2.267e-01, -9.549e-02, -1.909e-01, -1.314e-01, -1.145e-01, -3.460e-01, -2.630e-02, 3.692e-01, -1.209e-01), r);
	r = MulAdd(s3_8, M4(2.329e-01, 4.865e-02, 1.411e-02, 1.037e-01, -3.701e-01, -9.056e-02, -1.421e-01, -3.640e-01, -2.971e-02, -8.960e-02, 4.366e-03, -1.232e-01, -5.654e-02, -6.378e-02, 2.785e-01, -1.645e-01), r);
	return r;
}

// Pass 3 entry point. Each invocation handles one pixel: it gathers the 3x3
// neighbourhood of both inputs through this pass's l0/l1 tap macros (defined
// earlier in the file), splits every tap into its positive and negative part,
// and stores the results of f0 and f1 into t0 and t1.
//   blockStart - origin of the block this thread belongs to
//   tid        - thread id; only tid.x is used (remapped by Rmp8x8)
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are referenced by name inside the O() sampling macro,
	// so these two locals must keep exactly these names.
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	// Pixels outside the input produce no output.
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps of the first input, row-major from (-1,-1) to (1,1).
	V4 a0 = l0(-1.0, -1.0);
	V4 a1 = l0(0.0, -1.0);
	V4 a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0);
	V4 a4 = l0(0.0, 0.0);
	V4 a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0);
	V4 a7 = l0(0.0, 1.0);
	V4 a8 = l0(1.0, 1.0);

	// Raw taps of the second input, same ordering.
	V4 b0 = l1(-1.0, -1.0);
	V4 b1 = l1(0.0, -1.0);
	V4 b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0);
	V4 b4 = l1(0.0, 0.0);
	V4 b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0);
	V4 b7 = l1(0.0, 1.0);
	V4 b8 = l1(1.0, 1.0);

	// First input: s0_* keeps the positive part, s1_* the negative part.
	V4 s0_0 = max(a0, 0.0);
	V4 s0_1 = max(a1, 0.0);
	V4 s0_2 = max(a2, 0.0);
	V4 s0_3 = max(a3, 0.0);
	V4 s0_4 = max(a4, 0.0);
	V4 s0_5 = max(a5, 0.0);
	V4 s0_6 = max(a6, 0.0);
	V4 s0_7 = max(a7, 0.0);
	V4 s0_8 = max(a8, 0.0);
	V4 s1_0 = -max(-a0, 0.0);
	V4 s1_1 = -max(-a1, 0.0);
	V4 s1_2 = -max(-a2, 0.0);
	V4 s1_3 = -max(-a3, 0.0);
	V4 s1_4 = -max(-a4, 0.0);
	V4 s1_5 = -max(-a5, 0.0);
	V4 s1_6 = -max(-a6, 0.0);
	V4 s1_7 = -max(-a7, 0.0);
	V4 s1_8 = -max(-a8, 0.0);

	// Second input: s2_* keeps the positive part, s3_* the negative part.
	V4 s2_0 = max(b0, 0.0);
	V4 s2_1 = max(b1, 0.0);
	V4 s2_2 = max(b2, 0.0);
	V4 s2_3 = max(b3, 0.0);
	V4 s2_4 = max(b4, 0.0);
	V4 s2_5 = max(b5, 0.0);
	V4 s2_6 = max(b6, 0.0);
	V4 s2_7 = max(b7, 0.0);
	V4 s2_8 = max(b8, 0.0);
	V4 s3_0 = -max(-b0, 0.0);
	V4 s3_1 = -max(-b1, 0.0);
	V4 s3_2 = -max(-b2, 0.0);
	V4 s3_3 = -max(-b3, 0.0);
	V4 s3_4 = -max(-b4, 0.0);
	V4 s3_5 = -max(-b5, 0.0);
	V4 s3_6 = -max(-b6, 0.0);
	V4 s3_7 = -max(-b7, 0.0);
	V4 s3_8 = -max(-b8, 0.0);

	// Both output groups consume the same 36 split taps.
	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t1[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}

//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3

// Tap readers for this pass: l0 samples t0 and l1 samples t1 at texel offset
// (x, y) from the current pixel. Both expand through O(), which point-samples
// at `pos + offset * pt`, so `pos` and `pt` must be in scope where they are used.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))

// Pass 4, first output group: produces the 4-channel value that Pass4 stores
// into t2.
//
// Arguments are 3x3 neighbourhood taps; the trailing index 0..8 runs row-major
// from offset (-1,-1) to (1,1) (see the tap loads in Pass4):
//   s0_k = max(t0 tap, 0)      s1_k = -max(-(t0 tap), 0)
//   s2_k = max(t1 tap, 0)      s3_k = -max(-(t1 tap), 0)
//
// Starting from a constant 4-vector, every tap is folded into the accumulator
// `r` with MulAdd(tap, 4x4 constant matrix, r), in the fixed order listed below.
// NOTE(review): MulAdd is pulled in by the `//!USE MulAdd` directive; presumably
// it evaluates r + mul(tap, matrix) — confirm against Magpie's definition.
// NOTE(review): the literals look like generated (trained) weights; they should
// not be edited or reordered by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator (presumably the bias term).
	V4 r = { -9.721e-03, -6.824e-03, 1.589e-03, -2.346e-03 };
	// s0_*: positive parts of the t0 taps.
	r = MulAdd(s0_0, M4(1.057e-01, 8.402e-02, -7.690e-02, -5.544e-02, -1.356e-02, -1.211e-01, -3.841e-02, 7.556e-02, -1.057e-01, -6.017e-02, -5.377e-03, -3.373e-02, 2.015e-02, -3.443e-01, 4.561e-03, -1.471e-01), r);
	r = MulAdd(s0_1, M4(-9.330e-02, -3.572e-02, 1.283e-01, 1.837e-02, -1.698e-02, -7.926e-02, -3.830e-02, -6.985e-02, 1.791e-01, -1.764e-02, -2.987e-01, -2.914e-02, -1.109e-01, -2.236e-01, 1.116e-02, 1.155e-03), r);
	r = MulAdd(s0_2, M4(-1.422e-02, -3.524e-02, 6.026e-02, 2.085e-03, 3.122e-02, -8.025e-02, -2.071e-01, 6.779e-03, 3.087e-02, -1.078e-01, -1.336e-01, 3.374e-02, 4.059e-02, -1.509e-01, -2.500e-02, 1.575e-02), r);
	r = MulAdd(s0_3, M4(2.079e-01, 1.217e-01, -9.225e-02, -1.526e-01, 2.831e-01, -2.399e-01, -1.879e-01, 3.896e-02, -2.888e-02, 1.898e-01, 7.200e-03, 1.166e-01, 4.709e-01, -2.553e-01, -8.659e-02, -1.966e-01), r);
	r = MulAdd(s0_4, M4(-2.466e-01, 1.668e-01, -3.000e-01, 1.331e-01, 1.691e-01, -8.165e-02, 1.461e-01, -2.816e-01, 4.627e-01, -5.020e-02, 4.608e-02, -4.259e-01, 1.733e-01, -2.493e-01, -2.859e-01, 1.255e-01), r);
	r = MulAdd(s0_5, M4(-1.061e-01, -1.312e-02, 4.457e-03, 6.950e-02, -1.528e-02, -8.325e-03, 9.288e-03, 4.323e-04, -1.349e-01, -4.300e-02, -1.214e-01, -6.221e-02, 1.326e-01, 1.729e-02, -9.435e-03, -8.256e-02), r);
	r = MulAdd(s0_6, M4(-2.764e-01, 2.277e-01, -6.479e-02, 9.793e-02, -1.252e-01, -2.225e-01, -3.927e-02, 9.069e-03, -3.082e-02, -9.839e-02, -1.432e-02, 4.213e-03, 1.038e-01, -1.336e-02, -8.449e-02, 9.390e-03), r);
	r = MulAdd(s0_7, M4(1.321e-01, -1.783e-01, 3.953e-02, -4.840e-03, -8.847e-02, 3.953e-02, -2.180e-03, 3.542e-02, -1.772e-01, -9.279e-02, -1.972e-02, -1.933e-02, 9.595e-02, -7.297e-02, -2.008e-02, 2.667e-02), r);
	r = MulAdd(s0_8, M4(-1.079e-02, -5.796e-02, 8.349e-02, 9.407e-03, -6.521e-02, 2.585e-02, 3.029e-02, -2.179e-02, 1.644e-02, 2.894e-02, 9.475e-02, 2.535e-02, -1.062e-01, 1.059e-01, 5.134e-02, -4.920e-02), r);
	// s1_*: negative parts of the t0 taps.
	r = MulAdd(s1_0, M4(2.292e-02, -1.255e-01, 6.694e-03, -1.429e-02, 2.778e-02, 1.610e-02, -5.433e-02, -1.691e-02, -8.172e-02, 1.096e-01, -7.444e-02, -1.744e-02, -1.607e-02, -1.718e-02, 2.261e-02, -7.930e-02), r);
	r = MulAdd(s1_1, M4(-1.606e-01, 2.755e-01, 2.518e-01, -8.121e-02, 8.844e-02, -9.943e-02, -1.479e-01, 2.317e-03, -1.857e-02, 3.330e-01, -2.844e-01, 5.172e-03, 4.147e-02, 8.858e-05, -3.684e-02, 2.822e-02), r);
	r = MulAdd(s1_2, M4(1.972e-02, 1.372e-01, 8.050e-02, -3.001e-02, -2.199e-02, -1.105e-03, -6.767e-02, -1.520e-02, 2.293e-02, 4.982e-04, 1.077e-01, -4.043e-02, -3.146e-04, 1.341e-04, -2.927e-02, 1.540e-03), r);
	r = MulAdd(s1_3, M4(6.177e-01, -2.128e-01, -1.951e-01, -2.603e-01, -2.382e-01, 4.451e-02, -6.117e-02, 2.986e-01, -1.350e-01, -3.939e-02, -2.575e-02, 6.810e-02, -5.279e-02, -4.608e-02, -8.021e-02, -7.411e-02), r);
	r = MulAdd(s1_4, M4(-1.743e-01, 3.337e-01, -2.366e-01, 1.677e-01, 1.648e-01, -1.324e-01, 4.842e-02, -1.070e-01, 1.649e-01, 5.767e-01, 8.170e-02, -1.744e-01, 1.955e-01, 1.592e-01, -2.500e-01, -7.249e-02), r);
	r = MulAdd(s1_5, M4(-1.271e-01, -1.008e-01, 7.718e-04, 6.185e-03, 2.051e-02, -1.960e-02, 5.900e-02, -7.462e-03, -9.429e-02, -6.792e-02, -1.796e-01, 7.379e-02, -3.235e-02, 4.667e-03, -8.281e-02, -1.639e-02), r);
	r = MulAdd(s1_6, M4(2.655e-01, 4.646e-02, -7.062e-02, 1.656e-01, 1.325e-01, -1.897e-01, 5.944e-04, 4.675e-02, -5.384e-02, -9.959e-02, -6.261e-05, 2.482e-03, -1.603e-01, 1.532e-03, -6.567e-02, 5.924e-02), r);
	r = MulAdd(s1_7, M4(3.912e-01, -2.348e-01, -1.125e-02, 6.278e-02, -8.276e-02, 1.717e-01, -2.386e-02, 5.405e-02, 1.651e-01, -7.568e-02, -2.812e-02, 1.120e-02, 7.600e-02, -3.569e-02, 8.237e-02, 3.588e-02), r);
	r = MulAdd(s1_8, M4(-1.545e-01, -2.594e-01, 5.109e-02, -3.375e-02, -6.946e-02, 5.645e-04, -3.457e-02, -1.802e-02, -8.377e-02, 3.700e-02, -5.377e-02, 2.370e-03, -5.512e-02, 3.772e-02, -6.696e-03, -2.474e-02), r);
	// s2_*: positive parts of the t1 taps.
	r = MulAdd(s2_0, M4(9.595e-02, -1.386e-01, 6.095e-02, 1.382e-01, -6.666e-02, 3.536e-02, 3.510e-02, 9.837e-02, -7.013e-02, 1.777e-02, -6.525e-02, 4.813e-03, 4.890e-02, 1.697e-01, 4.055e-02, 3.352e-03), r);
	r = MulAdd(s2_1, M4(-9.534e-02, -1.931e-01, 7.936e-02, -5.563e-03, 1.180e-01, -1.126e-01, -4.561e-01, 1.382e-02, -5.442e-02, 1.826e-02, -1.906e-01, 2.481e-02, 1.479e-01, 6.991e-02, -4.527e-01, 4.636e-02), r);
	r = MulAdd(s2_2, M4(2.077e-02, -1.935e-02, 2.863e-02, -1.457e-03, 8.646e-02, -3.495e-02, -1.110e-01, -5.121e-02, 1.296e-02, 1.610e-01, 1.111e-02, -5.463e-03, -6.310e-02, 9.282e-02, -1.124e-02, -5.502e-03), r);
	r = MulAdd(s2_3, M4(-2.461e-01, 2.046e-01, 7.247e-02, 4.344e-01, -2.654e-03, -1.548e-01, 5.001e-02, -2.193e-02, -2.841e-02, 4.966e-02, -5.860e-02, -1.607e-02, -3.365e-01, 3.779e-01, -7.002e-02, 2.456e-01), r);
	r = MulAdd(s2_4, M4(-3.244e-02, 4.290e-01, 8.561e-02, -8.759e-02, 5.843e-01, 2.402e-01, -7.254e-02, 1.672e-01, 9.564e-03, 6.555e-01, 1.018e-01, 2.472e-01, 3.227e-01, -2.279e-01, 2.948e-01, 1.007e-01), r);
	r = MulAdd(s2_5, M4(7.899e-02, -3.829e-02, -6.897e-02, -6.274e-02, -1.648e-01, 1.819e-01, -3.680e-02, -2.700e-02, -1.223e-01, 3.068e-02, -9.879e-02, 9.617e-02, -1.048e-01, 8.574e-03, 1.253e-01, 1.451e-02), r);
	r = MulAdd(s2_6, M4(6.709e-02, -2.087e-03, -3.317e-02, 9.660e-02, -4.432e-02, 1.087e-01, -3.855e-03, 3.000e-02, -1.565e-01, 1.285e-02, -1.122e-02, -2.268e-02, 2.213e-01, -5.280e-03, -1.048e-02, 1.095e-02), r);
	r = MulAdd(s2_7, M4(3.409e-02, 1.319e-02, -3.860e-02, 1.186e-01, -1.729e-01, 6.734e-02, -4.122e-02, -1.425e-02, -1.333e-01, 3.082e-01, -5.667e-02, 1.330e-02, -7.902e-02, -2.279e-01, -2.277e-02, -7.345e-02), r);
	r = MulAdd(s2_8, M4(-8.456e-02, 7.960e-03, -3.009e-02, 3.726e-02, -1.028e-01, 9.260e-02, -9.118e-03, -1.295e-02, 8.764e-02, 3.246e-03, -3.662e-02, 3.309e-02, -6.546e-02, -1.168e-01, 1.179e-02, -1.356e-02), r);
	// s3_*: negative parts of the t1 taps.
	r = MulAdd(s3_0, M4(2.830e-01, -1.486e-01, -6.659e-02, 7.406e-02, 3.936e-02, 1.384e-01, -8.613e-02, 5.222e-02, -2.008e-02, -3.306e-02, 3.603e-03, 8.508e-03, 6.657e-02, 8.277e-03, -1.684e-02, -3.086e-02), r);
	r = MulAdd(s3_1, M4(2.673e-02, -1.550e-01, -1.939e-01, 1.124e-01, 1.494e-01, 3.597e-01, -2.434e-01, -3.767e-02, 6.287e-02, -1.489e-02, -2.108e-01, 9.890e-02, -6.012e-02, -1.588e-01, 5.574e-02, 6.678e-02), r);
	r = MulAdd(s3_2, M4(7.056e-02, -9.550e-02, 1.579e-02, 6.754e-02, 6.387e-02, 9.595e-02, -1.968e-01, -3.396e-02, -6.625e-03, 1.153e-01, -2.162e-02, -1.236e-02, -2.182e-02, 9.085e-02, -7.137e-02, 3.357e-02), r);
	r = MulAdd(s3_3, M4(8.677e-02, -7.446e-02, 8.953e-02, 2.128e-01, 3.175e-01, 1.377e-02, 9.215e-02, 8.452e-02, -2.583e-01, 5.582e-03, 4.653e-02, 1.038e-01, -5.802e-01, -1.506e-01, -5.875e-02, -4.308e-02), r);
	r = MulAdd(s3_4, M4(-5.314e-02, 5.278e-01, 5.725e-02, -3.945e-02, 3.018e-01, -3.613e-02, 1.135e-01, 6.378e-02, 1.129e-01, 3.251e-01, -2.298e-01, 2.272e-01, -2.859e-02, -1.997e-01, -1.289e-02, -9.605e-02), r);
	r = MulAdd(s3_5, M4(-8.572e-03, 1.098e-01, 6.951e-02, -1.006e-01, 8.029e-02, -1.004e-02, 3.641e-03, 3.452e-02, -2.027e-01, -1.419e-01, 9.411e-02, 4.631e-03, -2.382e-02, -1.774e-01, 9.282e-02, 7.921e-02), r);
	r = MulAdd(s3_6, M4(-5.645e-02, 5.717e-02, -8.271e-02, 1.460e-01, 1.135e-01, -2.347e-02, 3.011e-02, 6.505e-02, -1.186e-01, 6.520e-02, 2.698e-03, -4.274e-02, 3.661e-03, 1.253e-01, 1.867e-02, 3.871e-03), r);
	r = MulAdd(s3_7, M4(9.708e-02, -1.020e-01, -1.182e-01, 1.458e-01, -1.234e-01, -1.871e-01, -2.362e-02, -5.212e-02, 2.773e-01, -6.513e-02, -4.346e-03, -4.354e-02, 3.573e-01, -2.655e-02, 4.325e-02, -2.378e-02), r);
	r = MulAdd(s3_8, M4(-8.228e-02, -2.852e-02, -2.048e-02, 3.619e-02, 9.848e-03, 4.128e-02, -4.446e-03, -9.824e-03, -1.064e-01, -5.623e-02, 4.160e-03, -3.699e-02, -6.863e-03, -2.035e-02, 1.601e-03, -4.166e-02), r);
	return r;
}

// Pass 4, second output group: produces the 4-channel value that Pass4 stores
// into t3.
//
// Arguments are 3x3 neighbourhood taps; the trailing index 0..8 runs row-major
// from offset (-1,-1) to (1,1) (see the tap loads in Pass4):
//   s0_k = max(t0 tap, 0)      s1_k = -max(-(t0 tap), 0)
//   s2_k = max(t1 tap, 0)      s3_k = -max(-(t1 tap), 0)
//
// Starting from a constant 4-vector, every tap is folded into the accumulator
// `r` with MulAdd(tap, 4x4 constant matrix, r), in the fixed order listed below.
// NOTE(review): MulAdd is pulled in by the `//!USE MulAdd` directive; presumably
// it evaluates r + mul(tap, matrix) — confirm against Magpie's definition.
// NOTE(review): the literals look like generated (trained) weights; they should
// not be edited or reordered by hand.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator (presumably the bias term).
	V4 r = { -2.548e-03, 2.631e-03, -1.796e-03, 6.472e-05 };
	// s0_*: positive parts of the t0 taps.
	r = MulAdd(s0_0, M4(-2.150e-02, 3.627e-02, 1.098e-01, -2.531e-02, 7.604e-03, 6.546e-02, -3.447e-01, 5.052e-03, -5.728e-02, 1.302e-02, 1.200e-01, -3.060e-02, 3.626e-02, 8.441e-03, -1.570e-01, 1.402e-02), r);
	r = MulAdd(s0_1, M4(-3.360e-02, -1.269e-01, 1.234e-02, 9.393e-02, -1.762e-01, 2.675e-01, -1.063e-01, -8.648e-02, -1.491e-01, 1.604e-01, -1.693e-01, 5.005e-02, 1.166e-01, -1.870e-02, 8.843e-02, 1.417e-02), r);
	r = MulAdd(s0_2, M4(6.443e-02, 2.579e-02, 5.084e-02, -2.141e-02, -4.518e-02, -6.926e-02, -7.947e-02, -5.832e-03, 7.140e-02, -1.427e-01, 1.275e-01, 7.012e-02, -1.982e-03, -6.910e-02, -2.167e-01, 1.330e-02), r);
	r = MulAdd(s0_3, M4(-2.132e-02, 2.807e-02, 1.044e-01, 1.442e-02, 3.026e-01, 1.629e-01, 1.141e-01, 1.649e-01, 6.632e-02, 1.488e-02, -1.725e-01, 3.376e-02, -4.740e-01, 5.156e-03, -2.510e-01, 6.820e-02), r);
	r = MulAdd(s0_4, M4(-7.393e-02, 1.460e-01, -2.561e-02, 2.588e-01, 2.248e-01, 1.511e-01, 2.192e-01, -1.222e-01, 4.382e-01, 3.200e-02, 3.125e-02, -3.612e-01, -1.890e-01, -2.971e-01, -1.180e-01, -1.741e-01), r);
	r = MulAdd(s0_5, M4(-8.022e-03, 3.348e-03, -9.508e-02, -5.842e-03, 2.266e-02, -3.213e-02, -1.024e-01, -8.443e-02, 2.648e-03, -1.541e-01, 8.091e-02, -4.184e-02, -2.928e-02, 9.030e-02, -1.811e-01, -1.895e-02), r);
	r = MulAdd(s0_6, M4(2.134e-01, 1.204e-02, 2.013e-02, -8.178e-02, 1.413e-02, 5.513e-02, 1.111e-01, 2.577e-01, 1.495e-03, -5.161e-02, 1.933e-02, 7.202e-02, -1.053e-01, -6.070e-03, -1.038e-01, -1.344e-01), r);
	r = MulAdd(s0_7, M4(3.203e-01, -1.824e-01, -4.901e-02, -2.154e-01, 3.227e-02, 2.323e-02, 1.477e-02, 1.885e-02, 1.952e-01, -3.905e-02, 3.536e-02, 1.957e-01, 2.784e-01, -1.506e-01, -1.907e-01, 5.836e-02), r);
	r = MulAdd(s0_8, M4(-7.886e-02, -1.750e-01, 4.233e-02, -2.816e-01, -1.097e-02, 5.733e-03, -2.752e-02, 3.943e-03, -6.800e-02, -1.612e-02, -5.243e-02, -1.074e-02, -8.493e-02, 9.050e-02, -7.421e-02, -1.520e-01), r);
	// s1_*: negative parts of the t0 taps.
	r = MulAdd(s1_0, M4(1.024e-01, -3.071e-02, -7.767e-02, -1.160e-02, -1.668e-01, -7.681e-03, 7.039e-02, 4.690e-02, -4.672e-02, 3.627e-03, 2.337e-02, -1.154e-02, 2.441e-03, 1.719e-02, 3.137e-02, 2.789e-02), r);
	r = MulAdd(s1_1, M4(1.128e-01, 4.853e-02, 3.627e-01, 5.215e-03, -4.474e-02, -3.383e-02, 5.492e-03, 1.021e-01, -3.035e-02, 7.184e-03, 9.787e-02, 6.902e-02, 2.559e-02, 6.446e-03, -1.559e-01, 1.046e-01), r);
	r = MulAdd(s1_2, M4(5.861e-02, 8.621e-02, -4.399e-03, 1.267e-02, 1.553e-02, -1.611e-02, 5.352e-02, 2.444e-03, -3.847e-02, 1.450e-01, 2.706e-01, -1.122e-01, -2.804e-03, 1.538e-02, 6.326e-02, -8.110e-02), r);
	r = MulAdd(s1_3, M4(-4.627e-01, -2.751e-02, 1.883e-02, 6.767e-02, 1.365e-03, -7.980e-02, -1.068e-01, -8.643e-02, -1.110e-01, -5.781e-02, 2.143e-01, 6.533e-02, 9.844e-02, -1.375e-01, 1.719e-01, 1.652e-02), r);
	r = MulAdd(s1_4, M4(-3.445e-01, 1.223e-01, -5.479e-01, 1.608e-01, 2.282e-01, 4.579e-02, 2.864e-02, -2.323e-01, 3.533e-01, -1.573e-01, 5.705e-02, -1.202e-01, -3.604e-02, 7.107e-02, -7.818e-02, -1.104e-01), r);
	r = MulAdd(s1_5, M4(9.595e-02, 2.793e-01, -1.118e-01, -7.789e-02, -5.870e-02, -9.267e-02, 1.003e-01, -7.460e-02, 3.747e-02, -4.606e-01, 1.714e-01, -7.485e-04, -1.787e-02, -2.575e-02, 8.273e-02, -3.576e-02), r);
	r = MulAdd(s1_6, M4(8.346e-03, -1.200e-01, 1.913e-03, -5.262e-02, -3.471e-02, -3.522e-02, 4.409e-02, 7.695e-02, 4.910e-02, -5.825e-02, 1.350e-01, 3.096e-04, 3.797e-03, -9.389e-02, 5.738e-02, -1.094e-02), r);
	r = MulAdd(s1_7, M4(3.179e-01, -2.333e-01, 7.353e-02, -4.401e-01, -1.195e-01, 6.711e-02, -2.217e-02, -1.906e-01, 1.201e-01, -8.349e-02, 4.145e-02, -2.381e-02, 6.768e-02, 1.101e-02, 2.261e-02, -2.036e-01), r);
	r = MulAdd(s1_8, M4(-4.521e-02, -3.368e-01, 2.684e-01, -1.975e-01, -1.879e-02, 1.738e-02, 4.605e-02, 1.067e-01, -6.758e-02, -8.444e-02, 1.229e-01, -6.125e-02, -1.200e-02, -6.803e-03, -6.103e-03, 5.747e-02), r);
	// s2_*: positive parts of the t1 taps.
	r = MulAdd(s2_0, M4(-1.677e-02, -7.385e-03, 7.314e-03, 6.695e-02, 5.321e-03, -7.504e-03, 1.785e-01, 8.061e-02, 6.187e-02, 2.549e-02, -4.809e-02, -3.195e-02, -1.591e-01, 8.312e-02, -6.793e-02, 3.581e-02), r);
	r = MulAdd(s2_1, M4(3.261e-02, -7.202e-02, -1.190e-01, 1.516e-01, -5.670e-03, 1.967e-01, -4.255e-01, 4.834e-02, 4.823e-02, 4.959e-02, 2.604e-01, -9.989e-02, -4.437e-02, -6.331e-02, 5.512e-02, -4.163e-02), r);
	r = MulAdd(s2_2, M4(4.071e-02, 7.433e-02, 7.196e-02, -3.118e-02, -4.637e-02, -3.775e-02, -2.451e-02, -1.277e-02, 4.582e-02, -4.385e-02, 3.060e-01, -1.481e-02, 3.523e-03, 1.282e-02, -1.091e-01, 6.814e-02), r);
	r = MulAdd(s2_3, M4(2.584e-01, -1.657e-02, -1.307e-01, -1.203e-01, -5.162e-02, 6.113e-04, 6.253e-02, 1.557e-01, -1.242e-01, -5.452e-02, 3.666e-01, -9.119e-02, 5.750e-01, -4.420e-02, -2.222e-01, 3.541e-02), r);
	r = MulAdd(s2_4, M4(-3.262e-02, 1.971e-01, -7.493e-02, -2.959e-01, 1.890e-02, -4.554e-02, -1.241e-01, -2.084e-01, -3.179e-01, -6.226e-02, -3.992e-01, -4.678e-02, 9.481e-02, -2.873e-01, -9.361e-02, -2.487e-01), r);
	r = MulAdd(s2_5, M4(-9.744e-02, -3.792e-02, -1.262e-01, -1.641e-02, -9.649e-02, 2.342e-01, 7.814e-02, -1.527e-01, -2.606e-01, -1.009e-01, 1.738e-02, -1.352e-01, 4.687e-03, -4.251e-03, 8.552e-02, -1.215e-01), r);
	r = MulAdd(s2_6, M4(4.338e-02, -6.728e-02, -2.316e-02, 1.704e-01, -4.922e-02, -2.497e-02, -3.437e-02, -2.233e-02, 8.673e-02, -8.427e-02, 3.012e-01, 1.868e-02, -5.820e-02, 2.842e-02, 1.574e-01, -5.813e-02), r);
	r = MulAdd(s2_7, M4(7.404e-02, -4.425e-02, 8.098e-02, -7.744e-02, -2.555e-02, 7.253e-02, 8.971e-02, -4.607e-02, -1.052e-01, 7.666e-02, 3.011e-02, -3.467e-01, 5.327e-02, -4.348e-02, 2.473e-01, 1.847e-01), r);
	r = MulAdd(s2_8, M4(-1.205e-02, -1.017e-01, 1.288e-02, -6.228e-02, -7.597e-03, 9.311e-03, 2.267e-02, 5.187e-04, 2.936e-02, -1.846e-01, 2.377e-02, 1.072e-01, 7.069e-02, -6.199e-02, 1.657e-01, 1.708e-01), r);
	// s3_*: negative parts of the t1 taps.
	r = MulAdd(s3_0, M4(-2.337e-01, 2.411e-01, -5.654e-02, -6.128e-02, 1.441e-01, -7.801e-02, 8.768e-02, -1.401e-02, -7.979e-02, 2.938e-02, 3.423e-02, 2.258e-02, 3.470e-02, 1.049e-01, -9.900e-02, -3.849e-02), r);
	r = MulAdd(s3_1, M4(1.636e-01, -5.372e-01, 7.445e-02, 4.422e-01, 2.717e-02, 4.842e-01, -3.236e-01, -1.247e-01, 9.917e-03, 2.162e-02, -1.777e-01, 3.460e-02, -1.702e-01, -4.158e-02, -6.269e-02, 3.410e-03), r);
	r = MulAdd(s3_2, M4(-2.422e-03, -5.821e-02, -8.683e-02, -1.286e-02, 7.737e-02, 1.772e-01, 1.056e-01, -5.362e-02, 3.000e-02, 3.260e-02, -2.047e-02, -2.903e-02, 4.510e-02, 3.030e-02, 3.413e-02, 1.265e-01), r);
	r = MulAdd(s3_3, M4(-3.559e-01, -4.824e-03, 2.601e-01, 5.725e-02, 1.480e-01, 9.340e-02, -6.162e-03, -1.121e-01, 3.200e-02, -2.580e-02, -1.105e-01, 3.647e-02, 1.558e-01, 3.367e-02, -3.009e-01, 2.425e-01), r);
	r = MulAdd(s3_4, M4(-3.534e-01, 1.046e-01, -1.263e-02, -2.606e-01, 1.026e-01, 2.504e-01, -2.282e-01, -1.836e-01, 7.199e-02, -3.665e-02, -2.959e-01, -2.501e-01, 2.980e-02, -1.711e-01, 2.363e-01, -1.904e-01), r);
	r = MulAdd(s3_5, M4(-1.381e-01, 9.220e-02, 1.082e-01, -1.717e-01, 5.453e-02, 1.216e-01, 6.893e-03, 4.137e-02, -3.005e-02, -2.100e-03, -5.898e-02, -3.922e-02, 1.715e-01, -1.886e-01, -2.510e-01, 2.493e-01), r);
	r = MulAdd(s3_6, M4(5.587e-02, -5.724e-02, 4.462e-02, 1.133e-01, 2.672e-02, 1.392e-02, -1.762e-02, 8.027e-02, 1.795e-02, 4.128e-02, -1.986e-02, 1.847e-02, -1.562e-01, 6.650e-02, -1.597e-01, -8.866e-02), r);
	r = MulAdd(s3_7, M4(5.177e-02, -1.843e-01, 6.256e-03, 3.103e-02, -8.057e-03, 1.538e-01, 8.820e-02, 9.421e-02, -1.280e-01, 3.670e-02, 2.720e-03, -7.796e-02, -5.717e-02, -1.237e-01, -1.806e-01, 9.065e-02), r);
	r = MulAdd(s3_8, M4(-4.682e-02, -1.655e-01, 3.621e-02, 1.126e-02, 1.055e-01, 4.565e-02, -4.206e-02, 1.496e-01, 6.181e-02, 5.189e-02, 6.303e-02, -1.095e-01, -1.832e-02, 1.099e-01, 6.317e-03, -2.335e-01), r);
	return r;
}

// Pass 4 entry point. Each invocation handles one pixel: it gathers the 3x3
// neighbourhood of t0 (via l0) and t1 (via l1), splits every tap into its
// positive and negative part, and stores the results of f0 and f1 into t2
// and t3.
//   blockStart - origin of the block this thread belongs to
//   tid        - thread id; only tid.x is used (remapped by Rmp8x8)
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are referenced by name inside the O() sampling macro,
	// so these two locals must keep exactly these names.
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	// Pixels outside the input produce no output.
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps of t0, row-major from (-1,-1) to (1,1).
	V4 a0 = l0(-1.0, -1.0);
	V4 a1 = l0(0.0, -1.0);
	V4 a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0);
	V4 a4 = l0(0.0, 0.0);
	V4 a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0);
	V4 a7 = l0(0.0, 1.0);
	V4 a8 = l0(1.0, 1.0);

	// Raw taps of t1, same ordering.
	V4 b0 = l1(-1.0, -1.0);
	V4 b1 = l1(0.0, -1.0);
	V4 b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0);
	V4 b4 = l1(0.0, 0.0);
	V4 b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0);
	V4 b7 = l1(0.0, 1.0);
	V4 b8 = l1(1.0, 1.0);

	// t0 taps: s0_* keeps the positive part, s1_* the negative part.
	V4 s0_0 = max(a0, 0.0);
	V4 s0_1 = max(a1, 0.0);
	V4 s0_2 = max(a2, 0.0);
	V4 s0_3 = max(a3, 0.0);
	V4 s0_4 = max(a4, 0.0);
	V4 s0_5 = max(a5, 0.0);
	V4 s0_6 = max(a6, 0.0);
	V4 s0_7 = max(a7, 0.0);
	V4 s0_8 = max(a8, 0.0);
	V4 s1_0 = -max(-a0, 0.0);
	V4 s1_1 = -max(-a1, 0.0);
	V4 s1_2 = -max(-a2, 0.0);
	V4 s1_3 = -max(-a3, 0.0);
	V4 s1_4 = -max(-a4, 0.0);
	V4 s1_5 = -max(-a5, 0.0);
	V4 s1_6 = -max(-a6, 0.0);
	V4 s1_7 = -max(-a7, 0.0);
	V4 s1_8 = -max(-a8, 0.0);

	// t1 taps: s2_* keeps the positive part, s3_* the negative part.
	V4 s2_0 = max(b0, 0.0);
	V4 s2_1 = max(b1, 0.0);
	V4 s2_2 = max(b2, 0.0);
	V4 s2_3 = max(b3, 0.0);
	V4 s2_4 = max(b4, 0.0);
	V4 s2_5 = max(b5, 0.0);
	V4 s2_6 = max(b6, 0.0);
	V4 s2_7 = max(b7, 0.0);
	V4 s2_8 = max(b8, 0.0);
	V4 s3_0 = -max(-b0, 0.0);
	V4 s3_1 = -max(-b1, 0.0);
	V4 s3_2 = -max(-b2, 0.0);
	V4 s3_3 = -max(-b3, 0.0);
	V4 s3_4 = -max(-b4, 0.0);
	V4 s3_5 = -max(-b5, 0.0);
	V4 s3_6 = -max(-b6, 0.0);
	V4 s3_7 = -max(-b7, 0.0);
	V4 s3_8 = -max(-b8, 0.0);

	// Both output groups consume the same 36 split taps.
	t2[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t3[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}

//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1

// Tap readers for this pass: l0 samples t2 and l1 samples t3 at texel offset
// (x, y) from the current pixel. Both expand through O(), which point-samples
// at `pos + offset * pt`, so `pos` and `pt` must be in scope where they are used.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))

// conv4, first output vector: Pass5 stores the returned value to t0.
// The 36 arguments are the vectors Pass5 builds from two 3x3 neighbourhoods;
// the index k in sN_k runs row-major over the offsets (-1,-1) .. (1,1):
//   s0_k = max(t2 tap, 0)      s1_k = -max(-(t2 tap), 0)
//   s2_k = max(t3 tap, 0)      s3_k = -max(-(t3 tap), 0)
// Starting from a constant vector, every argument is folded into the
// accumulator r together with its own constant 4x4 matrix via MulAdd, in the
// fixed order s0_0 .. s3_8, and the final r is returned.
// NOTE(review): MulAdd is supplied by the framework (//!USE MulAdd); it
// presumably evaluates mul(v, m) + r - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { -1.083e-02, 2.841e-03, 4.056e-03, -3.056e-03 };
	// Terms for s0_0 .. s0_8.
	r = MulAdd(s0_0, M4(4.108e-02, 4.397e-02, 8.099e-02, 6.707e-02, -7.574e-03, -3.379e-02, -1.189e-01, 2.469e-02, 5.677e-02, 1.936e-01, -2.147e-01, 2.745e-02, -2.339e-02, 1.413e-01, 9.424e-03, -1.532e-02), r);
	r = MulAdd(s0_1, M4(6.845e-02, 2.566e-01, -3.429e-01, 7.152e-02, 7.412e-03, -1.234e-01, 1.319e-01, -1.394e-01, -1.452e-01, 4.029e-02, -8.428e-02, -9.997e-02, 3.241e-01, 4.534e-01, 1.353e-01, -6.836e-02), r);
	r = MulAdd(s0_2, M4(8.957e-02, 2.015e-01, -9.037e-02, 4.122e-02, -9.043e-02, -2.070e-01, 1.988e-01, -7.000e-02, -1.122e-02, 3.547e-02, 2.630e-03, -6.662e-03, 1.449e-01, 1.636e-01, -4.471e-02, 7.979e-02), r);
	r = MulAdd(s0_3, M4(7.138e-02, 3.893e-02, 7.791e-02, 4.093e-02, -2.462e-01, 3.752e-02, 7.621e-03, -4.290e-02, -1.033e-01, 1.806e-01, -8.490e-03, -1.038e-01, 4.361e-02, 7.536e-02, 7.619e-02, -5.600e-02), r);
	r = MulAdd(s0_4, M4(-2.419e-02, 1.601e-01, 1.182e-01, -6.286e-02, 4.891e-02, 4.624e-02, -9.374e-02, 1.644e-01, -1.319e-01, 3.175e-01, -5.865e-02, -8.696e-02, 5.228e-01, 3.163e-01, -2.053e-01, 1.315e-01), r);
	r = MulAdd(s0_5, M4(2.643e-02, -1.714e-01, -2.308e-02, 2.050e-02, -1.852e-01, -5.479e-02, 6.061e-02, -1.538e-01, -7.382e-02, -4.387e-04, -5.849e-02, -5.184e-02, 1.197e-01, 5.114e-02, -3.098e-02, 1.580e-02), r);
	r = MulAdd(s0_6, M4(3.068e-02, 4.721e-04, 3.968e-02, 3.974e-02, 1.701e-02, -2.658e-02, 4.949e-02, -7.216e-02, -1.259e-01, -6.904e-03, 7.295e-02, 1.948e-02, -3.177e-02, -3.845e-03, 2.910e-02, -6.175e-03), r);
	r = MulAdd(s0_7, M4(3.500e-02, -6.070e-02, -3.675e-02, 5.680e-02, -8.178e-02, -1.257e-01, -1.144e-03, 1.419e-01, 1.017e-01, 6.111e-02, -9.663e-02, 3.145e-01, 1.886e-01, 1.274e-01, -6.735e-02, 1.646e-02), r);
	r = MulAdd(s0_8, M4(1.100e-02, -1.008e-01, -2.527e-02, 9.051e-02, -2.122e-02, -3.495e-02, 2.186e-02, -3.673e-02, -5.042e-02, -4.490e-02, -6.696e-02, -1.370e-01, 8.272e-02, -1.107e-01, 3.828e-02, 2.405e-02), r);
	// Terms for s1_0 .. s1_8.
	r = MulAdd(s1_0, M4(4.308e-02, -1.587e-02, 5.034e-02, -1.256e-03, -3.630e-03, 9.342e-02, -1.246e-01, 1.407e-03, 2.283e-01, 7.103e-02, 1.496e-01, 1.529e-03, 7.748e-02, -4.744e-02, 1.267e-01, 8.372e-05), r);
	r = MulAdd(s1_1, M4(-2.667e-01, 4.345e-03, -6.051e-02, 7.748e-04, 5.212e-02, 7.912e-02, -7.013e-02, -4.177e-02, -5.925e-03, 7.525e-03, 1.316e-01, 1.347e-02, -1.046e-01, -3.121e-01, 4.098e-01, -3.102e-02), r);
	r = MulAdd(s1_2, M4(-7.060e-02, 2.314e-01, -9.224e-02, 2.077e-02, -4.781e-03, -7.540e-02, 7.904e-02, -2.268e-02, 4.577e-02, 1.067e-01, -1.948e-03, 4.846e-02, -2.153e-01, -1.920e-01, 7.195e-02, 5.041e-02), r);
	r = MulAdd(s1_3, M4(1.662e-02, 3.623e-03, 9.545e-03, 5.119e-04, -1.384e-01, 9.941e-03, 4.085e-02, 1.023e-02, 8.623e-02, -1.836e-02, 1.508e-01, 2.565e-02, 1.250e-01, -6.873e-02, 1.039e-02, 3.096e-02), r);
	r = MulAdd(s1_4, M4(-3.783e-01, 5.804e-02, 5.745e-02, -1.070e-01, 4.047e-01, 4.005e-01, -2.802e-01, 1.174e-01, -3.899e-01, -2.940e-01, 6.426e-01, -1.456e-01, -2.346e-01, -2.872e-01, 3.019e-02, 2.075e-01), r);
	r = MulAdd(s1_5, M4(-1.279e-01, -5.204e-02, 1.008e-01, -7.402e-02, -1.272e-01, -2.809e-02, 4.025e-02, -4.037e-02, 1.705e-01, 1.347e-01, 4.935e-02, -5.119e-03, -3.623e-01, -2.111e-01, 2.074e-01, -4.637e-01), r);
	r = MulAdd(s1_6, M4(3.894e-02, 1.740e-02, 1.167e-02, -8.472e-02, 2.890e-02, -2.453e-02, 4.209e-02, 1.043e-01, 8.521e-02, -9.523e-02, 6.200e-02, 1.241e-02, 3.387e-03, -2.783e-02, -2.927e-02, -2.315e-02), r);
	r = MulAdd(s1_7, M4(-1.122e-01, -5.652e-02, -4.186e-02, -4.056e-02, 1.860e-02, 8.204e-03, -1.358e-02, 2.056e-01, -6.379e-02, -1.215e-02, 1.135e-01, -2.047e-01, -1.013e-02, 1.194e-02, -1.397e-01, -1.773e-01), r);
	r = MulAdd(s1_8, M4(-2.283e-03, -1.172e-02, 2.030e-02, -3.740e-03, 6.208e-02, -1.690e-02, 7.899e-03, 1.212e-01, 1.094e-01, -4.111e-02, 4.576e-02, 7.825e-02, -6.665e-02, -9.950e-02, 6.104e-03, -1.272e-01), r);
	// Terms for s2_0 .. s2_8.
	r = MulAdd(s2_0, M4(5.802e-02, -8.129e-02, 7.649e-02, 8.207e-02, -7.115e-02, -9.510e-02, 2.679e-01, 5.415e-02, 3.331e-02, -5.390e-02, 2.535e-01, -6.838e-02, 1.007e-01, -8.470e-02, 1.052e-01, -4.556e-03), r);
	r = MulAdd(s2_1, M4(-7.709e-02, -2.091e-01, 7.253e-02, 3.751e-02, -3.012e-02, 2.705e-01, 9.391e-02, 5.310e-02, -3.819e-02, -1.502e-01, 3.674e-01, 6.050e-02, 8.643e-02, -3.193e-02, 3.706e-01, -1.272e-01), r);
	r = MulAdd(s2_2, M4(-8.654e-02, -1.264e-01, 9.612e-02, -5.250e-02, 1.367e-02, 2.760e-02, 3.644e-02, 5.628e-02, -2.305e-02, -3.072e-02, 7.435e-02, 1.310e-01, 1.349e-03, -8.390e-02, 8.076e-02, 9.622e-02), r);
	r = MulAdd(s2_3, M4(2.535e-02, 3.247e-02, -3.928e-02, 1.713e-02, -1.630e-01, -4.284e-02, -8.256e-02, 2.143e-01, 1.523e-01, -1.190e-01, 1.579e-01, 1.972e-02, -1.024e-01, -3.809e-02, -1.194e-01, 1.843e-02), r);
	r = MulAdd(s2_4, M4(-2.534e-01, 4.147e-03, 3.652e-02, -2.514e-01, -4.926e-02, 1.540e-01, -6.906e-02, -2.077e-01, -4.119e-02, 6.231e-01, -2.267e-01, -1.284e-01, -1.211e-01, 1.260e-01, -3.930e-02, 1.480e-01), r);
	r = MulAdd(s2_5, M4(-1.185e-01, -1.450e-01, 5.094e-02, -1.236e-01, 5.897e-02, 3.769e-02, 4.215e-02, 4.909e-02, -1.146e-01, 1.742e-02, 2.068e-01, 1.393e-01, 7.795e-02, 1.028e-01, 4.501e-02, 1.306e-01), r);
	r = MulAdd(s2_6, M4(7.121e-03, 1.788e-02, 8.609e-03, 4.098e-02, 9.014e-02, 4.066e-02, -4.107e-02, -5.454e-02, 8.660e-02, 3.154e-02, 1.323e-01, 2.997e-02, 1.011e-02, -7.761e-02, 8.035e-02, 1.981e-02), r);
	r = MulAdd(s2_7, M4(-2.280e-02, -5.927e-02, -4.650e-02, -2.669e-02, 1.326e-02, -6.787e-02, -9.816e-02, -2.418e-01, -2.725e-02, -2.671e-02, 6.625e-02, 1.176e-01, -1.511e-01, -8.170e-02, 8.073e-02, -4.317e-02), r);
	r = MulAdd(s2_8, M4(2.503e-02, -5.313e-02, -5.163e-02, -1.025e-01, 3.311e-03, 5.522e-02, -2.906e-02, 6.220e-02, -1.606e-01, -9.050e-02, 1.435e-01, -7.260e-02, -1.731e-03, -3.100e-02, 6.860e-02, 5.068e-02), r);
	// Terms for s3_0 .. s3_8.
	r = MulAdd(s3_0, M4(2.504e-01, 3.584e-02, 4.543e-02, 5.108e-03, 1.660e-01, 3.755e-02, 2.497e-02, 3.799e-02, 1.946e-01, 6.233e-02, 1.666e-02, 2.337e-02, 4.533e-01, -1.371e-01, -3.372e-01, 9.264e-02), r);
	r = MulAdd(s3_1, M4(-2.139e-01, -3.254e-01, 2.315e-01, 1.407e-01, -4.157e-02, 1.958e-01, -6.251e-03, 2.082e-03, -3.687e-02, 7.719e-02, 8.827e-02, 1.692e-02, 2.429e-03, -4.380e-02, -1.665e-01, -4.293e-01), r);
	r = MulAdd(s3_2, M4(2.632e-02, -4.644e-01, 3.778e-01, 1.915e-01, 1.765e-02, 9.826e-02, -5.215e-02, 6.615e-02, 4.066e-02, 8.500e-02, -6.289e-02, 1.127e-01, 4.373e-02, -2.529e-02, -3.463e-03, 1.561e-02), r);
	r = MulAdd(s3_3, M4(1.812e-01, 3.307e-02, -1.222e-01, -1.056e-01, 3.178e-01, -6.752e-02, -4.926e-02, 9.877e-02, -1.413e-02, 2.520e-02, 2.318e-02, -5.239e-02, -1.853e-02, 1.247e-01, -2.800e-01, -4.267e-02), r);
	r = MulAdd(s3_4, M4(-2.170e-01, 9.658e-02, -2.656e-02, -3.532e-01, -2.667e-01, -1.484e-01, 1.986e-01, 3.970e-01, -9.642e-02, 3.251e-01, -4.856e-02, -1.348e-01, -2.365e-01, -7.690e-02, -4.313e-02, -3.293e-01), r);
	r = MulAdd(s3_5, M4(3.116e-01, 3.575e-01, -1.439e-01, -2.788e-01, 1.682e-01, 3.373e-02, 1.132e-03, -2.062e-02, 2.790e-02, 4.237e-02, 6.550e-02, -7.303e-02, 1.220e-01, 4.608e-02, 2.672e-02, -6.588e-02), r);
	r = MulAdd(s3_6, M4(1.361e-01, 1.959e-02, -6.558e-02, -5.488e-02, 1.780e-01, 1.400e-02, -8.174e-02, 3.617e-02, 2.232e-02, 1.570e-02, 2.935e-02, 1.595e-02, 4.166e-02, 1.395e-02, -7.261e-02, 3.343e-02), r);
	r = MulAdd(s3_7, M4(-6.674e-03, -7.837e-02, 2.041e-02, -7.274e-02, -1.975e-01, -1.022e-01, 3.521e-02, -2.381e-01, 4.452e-02, -4.855e-02, -2.357e-02, 4.965e-02, -2.079e-03, -5.777e-02, -4.211e-02, -1.147e-01), r);
	r = MulAdd(s3_8, M4(1.852e-02, -4.532e-02, 3.780e-02, -2.960e-02, 4.216e-03, 7.749e-02, -2.922e-03, 9.211e-02, 1.465e-02, 1.361e-03, -5.097e-02, 2.529e-02, 6.341e-02, 1.534e-02, -2.929e-02, 6.519e-02), r);
	return r;
}

// conv4, second output vector: Pass5 stores the returned value to t1.
// Takes the same 36 arguments as this pass's f0 (sN_k, with k running
// row-major over the 3x3 offsets; s0/s1 come from t2 taps, s2/s3 from t3
// taps) but uses its own starting vector and its own 4x4 matrices.
// Each argument is folded into the accumulator r via MulAdd in the fixed
// order s0_0 .. s3_8, and the final r is returned.
// NOTE(review): MulAdd is supplied by the framework (//!USE MulAdd); it
// presumably evaluates mul(v, m) + r - confirm against its definition.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { -5.051e-03, -7.374e-04, -9.013e-03, -1.041e-02 };
	// Terms for s0_0 .. s0_8.
	r = MulAdd(s0_0, M4(-6.130e-04, -7.656e-02, 1.752e-01, 1.022e-01, 5.050e-03, -7.227e-02, 4.673e-03, -1.316e-01, -8.276e-02, -2.565e-02, 1.057e-01, 4.770e-02, -6.372e-02, -9.303e-02, 3.416e-02, 2.778e-02), r);
	r = MulAdd(s0_1, M4(-1.171e-01, 7.738e-02, 1.872e-01, 2.325e-02, 5.726e-02, 6.929e-02, -1.247e-01, -3.376e-02, -1.218e-01, 1.106e-01, -9.175e-02, 2.578e-02, -1.139e-01, -5.745e-02, -5.009e-03, 1.082e-01), r);
	r = MulAdd(s0_2, M4(-1.007e-01, 7.851e-03, 2.006e-02, 1.214e-01, 8.999e-02, -2.452e-02, -6.921e-02, -1.470e-01, -2.441e-02, -6.812e-02, -3.163e-02, -3.730e-02, -9.304e-02, -6.580e-02, -3.085e-02, 4.557e-02), r);
	r = MulAdd(s0_3, M4(2.147e-02, 2.392e-02, 6.809e-02, 5.823e-02, -5.266e-02, -6.499e-02, -1.937e-01, -1.774e-01, -7.026e-02, 5.565e-02, 2.104e-01, 9.940e-02, -6.656e-02, -5.599e-02, -9.527e-02, -2.195e-02), r);
	r = MulAdd(s0_4, M4(-1.314e-01, -2.100e-02, -8.236e-02, -5.332e-03, -8.806e-02, -1.931e-02, -3.184e-02, 1.283e-01, -1.044e-01, -9.658e-02, -3.773e-02, 4.251e-01, -6.699e-01, -2.441e-01, 2.817e-01, 3.949e-02), r);
	r = MulAdd(s0_5, M4(-6.834e-02, 1.218e-01, -6.189e-02, -1.587e-01, -9.935e-03, 4.393e-02, -2.135e-01, -2.161e-01, 5.552e-03, 5.016e-02, 3.099e-02, -1.251e-01, 2.489e-02, -4.931e-01, 5.038e-03, -8.030e-03), r);
	r = MulAdd(s0_6, M4(3.225e-02, -2.105e-02, 3.418e-03, 1.529e-02, -3.405e-02, 3.200e-02, -1.592e-01, -9.388e-02, 6.723e-02, 3.172e-02, -4.816e-02, 1.309e-01, -1.431e-02, -7.290e-02, -9.302e-02, -8.654e-02), r);
	r = MulAdd(s0_7, M4(4.884e-02, 1.260e-01, 6.116e-02, 8.572e-02, 1.022e-01, 2.291e-02, -7.667e-02, -1.023e-01, -1.076e-01, 2.182e-01, 1.787e-01, 4.600e-01, 4.603e-02, 4.013e-02, -1.248e-01, 1.515e-02), r);
	r = MulAdd(s0_8, M4(2.182e-02, 1.733e-01, 1.738e-01, 1.317e-01, -3.960e-02, 1.821e-02, -6.237e-02, -3.957e-02, -8.466e-02, 3.118e-01, -1.109e-03, 6.180e-02, 1.087e-01, 1.219e-01, 1.164e-01, 7.167e-02), r);
	// Terms for s1_0 .. s1_8.
	r = MulAdd(s1_0, M4(-5.654e-03, -5.632e-02, -1.110e-01, -8.149e-02, -4.162e-02, -3.990e-02, 7.565e-02, -5.569e-02, 5.983e-02, 3.346e-02, 1.598e-02, 3.390e-02, 7.153e-02, 7.041e-03, -1.990e-02, 1.044e-01), r);
	r = MulAdd(s1_1, M4(-1.193e-02, -1.392e-01, -6.353e-02, -5.281e-02, -3.663e-02, 4.147e-02, -1.629e-01, -1.972e-02, 4.507e-02, 6.254e-02, -2.663e-02, -2.552e-02, 2.680e-01, 6.236e-02, -5.901e-02, -1.626e-02), r);
	r = MulAdd(s1_2, M4(-1.057e-01, -3.432e-02, 5.563e-02, 1.240e-01, 1.598e-02, -3.795e-02, -1.715e-02, -2.897e-03, 1.671e-02, 7.545e-04, 3.111e-02, 5.136e-02, 1.207e-01, -1.587e-02, 7.899e-02, 3.483e-02), r);
	r = MulAdd(s1_3, M4(-1.056e-02, 8.004e-03, -1.297e-02, -2.314e-02, -4.324e-02, -1.453e-02, -3.458e-02, 7.711e-03, 9.450e-02, -7.279e-03, -1.025e-01, -2.236e-01, 5.674e-02, 6.152e-02, 7.612e-02, 8.618e-03), r);
	r = MulAdd(s1_4, M4(-1.020e-01, -1.119e-01, -4.415e-01, -4.113e-01, -2.564e-01, -8.898e-02, 2.198e-01, 3.662e-01, 3.076e-01, -3.531e-01, -3.432e-02, 2.568e-01, -1.218e-01, 5.504e-02, 2.069e-01, -6.714e-01), r);
	r = MulAdd(s1_5, M4(-5.354e-02, 5.475e-02, -3.443e-01, -2.398e-01, -2.952e-02, 1.826e-02, -3.823e-02, -9.658e-02, -1.836e-02, -7.218e-02, 5.875e-02, -1.017e-01, 2.128e-01, -2.690e-01, -1.918e-01, -2.150e-01), r);
	r = MulAdd(s1_6, M4(9.334e-04, 1.622e-02, 3.458e-02, 6.131e-02, 2.779e-02, 4.021e-03, 4.950e-02, -1.843e-02, 7.233e-02, 2.085e-03, -1.719e-02, 2.032e-02, 4.844e-03, -5.081e-03, 2.610e-02, 1.633e-02), r);
	r = MulAdd(s1_7, M4(-3.670e-02, 5.013e-02, -6.773e-02, -6.609e-02, 1.170e-01, 3.290e-02, -9.547e-02, -2.685e-02, -1.327e-01, -3.630e-03, -2.409e-01, -3.083e-01, 3.003e-02, 1.812e-01, 1.606e-01, 1.490e-01), r);
	r = MulAdd(s1_8, M4(-4.339e-02, 3.640e-03, -4.089e-02, -2.330e-02, 2.747e-02, 1.597e-01, 6.442e-03, 1.263e-01, -1.345e-02, 7.396e-02, 1.475e-02, -2.817e-03, -3.443e-02, 1.270e-01, 8.616e-02, 2.625e-02), r);
	// Terms for s2_0 .. s2_8.
	r = MulAdd(s2_0, M4(5.066e-02, 8.901e-02, -5.357e-02, -3.008e-04, 6.639e-02, 4.513e-02, -7.859e-02, 1.288e-01, 1.788e-02, 3.146e-03, -1.870e-01, 8.331e-03, 8.237e-02, 1.556e-02, -4.060e-02, -3.354e-02), r);
	r = MulAdd(s2_1, M4(1.211e-01, -1.958e-02, -1.821e-01, -2.678e-01, 5.985e-02, -3.434e-02, -1.473e-04, -4.211e-02, 9.400e-02, -1.979e-01, -1.631e-01, -1.431e-01, 1.288e-01, 1.301e-01, 7.110e-02, 1.196e-01), r);
	r = MulAdd(s2_2, M4(9.777e-02, 7.517e-02, 5.213e-02, 2.601e-02, 6.137e-04, 2.892e-02, -3.826e-02, 3.445e-02, 2.244e-02, -2.289e-01, -2.827e-02, -1.499e-01, -1.293e-02, -1.242e-01, 7.029e-03, 3.803e-02), r);
	r = MulAdd(s2_3, M4(-7.883e-03, 3.989e-02, 2.103e-01, 9.354e-02, -2.803e-02, -2.155e-02, 1.230e-01, -2.510e-01, -4.577e-02, -8.206e-02, -4.634e-02, -5.687e-02, -4.763e-02, 8.117e-04, 4.716e-02, -6.873e-02), r);
	r = MulAdd(s2_4, M4(5.074e-02, -9.310e-02, -3.064e-01, -2.342e-01, -3.624e-01, 1.487e-01, -1.847e-01, -1.638e-01, -2.098e-01, -2.940e-01, -7.838e-02, -3.450e-01, -1.351e-01, 1.343e-01, -2.153e-01, -3.085e-01), r);
	r = MulAdd(s2_5, M4(-9.094e-02, 1.096e-01, -1.252e-01, -1.611e-01, -1.475e-02, -1.251e-01, -3.660e-02, -9.706e-02, 8.987e-02, -2.422e-01, -1.290e-01, -8.441e-02, 1.325e-01, -1.979e-01, 1.195e-01, 1.066e-01), r);
	r = MulAdd(s2_6, M4(3.450e-02, -1.976e-03, -2.692e-02, 2.470e-02, 4.134e-02, 1.106e-01, 5.587e-03, 1.203e-01, 3.348e-03, -1.976e-01, -6.304e-03, -9.345e-02, 4.034e-03, -2.611e-02, -2.223e-02, -4.153e-02), r);
	r = MulAdd(s2_7, M4(-1.626e-02, 9.840e-02, -8.620e-03, -1.479e-02, -1.601e-01, -1.274e-01, -1.019e-01, -1.193e-01, 1.353e-01, -1.601e-01, -3.038e-01, -2.066e-01, 4.585e-02, -7.649e-02, -1.581e-01, -1.694e-01), r);
	r = MulAdd(s2_8, M4(-6.049e-02, -7.053e-02, 6.931e-03, -1.654e-02, 1.932e-02, 2.112e-02, 6.621e-02, 7.266e-02, -5.272e-02, -5.286e-02, -2.885e-01, -2.012e-01, 5.525e-02, -1.721e-02, -4.575e-02, -9.210e-04), r);
	// Terms for s3_0 .. s3_8.
	r = MulAdd(s3_0, M4(-2.271e-02, 2.668e-02, -1.480e-01, -2.625e-02, 9.554e-03, 6.925e-02, -1.252e-01, 2.866e-02, 1.177e-02, -4.071e-02, -2.070e-02, -2.731e-02, 8.067e-02, 1.324e-01, 5.591e-02, 1.092e-01), r);
	r = MulAdd(s3_1, M4(2.863e-01, -3.245e-02, -2.369e-01, -2.863e-01, -2.117e-03, -4.875e-02, 3.885e-02, -1.088e-01, 3.018e-02, -7.956e-02, 1.067e-01, 4.768e-03, 2.319e-01, 4.443e-01, 1.081e-01, -2.955e-01), r);
	r = MulAdd(s3_2, M4(2.579e-01, -1.163e-01, -4.524e-02, 1.974e-01, -3.204e-02, 2.183e-02, -9.418e-03, 1.514e-03, -7.422e-02, -8.652e-02, -1.831e-02, 7.031e-02, 1.678e-02, -6.474e-02, 3.298e-03, 1.898e-01), r);
	r = MulAdd(s3_3, M4(-1.287e-01, 6.977e-02, 2.034e-01, 1.203e-01, -1.577e-01, -1.159e-01, 5.411e-01, 1.079e-01, -5.869e-02, -9.771e-02, 4.457e-02, 1.618e-02, -1.096e-01, 7.700e-02, 1.746e-01, -4.051e-02), r);
	r = MulAdd(s3_4, M4(-2.308e-01, -1.267e-01, -1.459e-01, 5.467e-03, -1.625e-01, 2.188e-01, -1.188e-01, -9.377e-03, -2.440e-01, -8.329e-02, -1.366e-01, 4.782e-02, -3.538e-01, 1.737e-01, -1.531e-01, -1.443e-01), r);
	r = MulAdd(s3_5, M4(-4.365e-01, 5.178e-01, -2.166e-01, -2.063e-01, 3.981e-02, -1.054e-02, 5.701e-03, -4.639e-02, 4.680e-03, 2.433e-02, -3.099e-02, -1.344e-01, -4.936e-03, -2.510e-02, 9.593e-02, 6.336e-03), r);
	r = MulAdd(s3_6, M4(-9.464e-03, 6.306e-02, 9.266e-02, 1.239e-01, 9.411e-02, -8.883e-03, 1.447e-02, 3.888e-02, 3.209e-03, -4.987e-02, -7.248e-02, -4.528e-02, 3.389e-02, 3.453e-02, 4.363e-02, -1.139e-02), r);
	r = MulAdd(s3_7, M4(-1.101e-01, -7.831e-02, -6.882e-02, -4.597e-02, -2.022e-01, -1.681e-01, -8.124e-02, -1.088e-01, 7.872e-02, 5.759e-02, 6.814e-02, 9.876e-02, -3.474e-02, 5.673e-02, 1.178e-01, 1.018e-01), r);
	r = MulAdd(s3_8, M4(-4.424e-02, -1.753e-01, -1.880e-01, -1.999e-01, 1.189e-02, 2.421e-02, 7.007e-03, -2.655e-02, -3.715e-02, 6.751e-02, -7.333e-02, -7.710e-02, 2.700e-02, 2.248e-02, 9.982e-02, 5.511e-02), r);
	return r;
}

// Entry point of pass 5 (conv4).
// Derives this thread's texel from blockStart and Rmp8x8(tid.x), leaves early
// when that texel lies outside GetInputSize(), and otherwise reads the 3x3
// neighbourhoods of t2 (through l0) and t3 (through l1). Every tap is split
// into max(tap, 0) and -max(-tap, 0); the resulting 36 vectors are handed to
// f0 and f1, whose results are stored to t0 and t1 at that texel.
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are picked up by name inside the O() macro that l0/l1
	// expand to, so these two identifiers must not be renamed.
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 extent = GetInputSize();
	if (texel.x >= extent.x || texel.y >= extent.y) {
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// Raw t2 taps, row-major over the offsets (-1,-1) .. (1,1).
	V4 a0 = l0(-1.0, -1.0);
	V4 a1 = l0(0.0, -1.0);
	V4 a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0);
	V4 a4 = l0(0.0, 0.0);
	V4 a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0);
	V4 a7 = l0(0.0, 1.0);
	V4 a8 = l0(1.0, 1.0);

	// Raw t3 taps, same offset order.
	V4 b0 = l1(-1.0, -1.0);
	V4 b1 = l1(0.0, -1.0);
	V4 b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0);
	V4 b4 = l1(0.0, 0.0);
	V4 b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0);
	V4 b7 = l1(0.0, 1.0);
	V4 b8 = l1(1.0, 1.0);

	// Split each t2 tap: hi0_k = max(tap, 0), lo0_k = -max(-tap, 0).
	V4 hi0_0 = max(a0, 0.0), lo0_0 = -max(-a0, 0.0);
	V4 hi0_1 = max(a1, 0.0), lo0_1 = -max(-a1, 0.0);
	V4 hi0_2 = max(a2, 0.0), lo0_2 = -max(-a2, 0.0);
	V4 hi0_3 = max(a3, 0.0), lo0_3 = -max(-a3, 0.0);
	V4 hi0_4 = max(a4, 0.0), lo0_4 = -max(-a4, 0.0);
	V4 hi0_5 = max(a5, 0.0), lo0_5 = -max(-a5, 0.0);
	V4 hi0_6 = max(a6, 0.0), lo0_6 = -max(-a6, 0.0);
	V4 hi0_7 = max(a7, 0.0), lo0_7 = -max(-a7, 0.0);
	V4 hi0_8 = max(a8, 0.0), lo0_8 = -max(-a8, 0.0);

	// Split each t3 tap the same way.
	V4 hi1_0 = max(b0, 0.0), lo1_0 = -max(-b0, 0.0);
	V4 hi1_1 = max(b1, 0.0), lo1_1 = -max(-b1, 0.0);
	V4 hi1_2 = max(b2, 0.0), lo1_2 = -max(-b2, 0.0);
	V4 hi1_3 = max(b3, 0.0), lo1_3 = -max(-b3, 0.0);
	V4 hi1_4 = max(b4, 0.0), lo1_4 = -max(-b4, 0.0);
	V4 hi1_5 = max(b5, 0.0), lo1_5 = -max(-b5, 0.0);
	V4 hi1_6 = max(b6, 0.0), lo1_6 = -max(-b6, 0.0);
	V4 hi1_7 = max(b7, 0.0), lo1_7 = -max(-b7, 0.0);
	V4 hi1_8 = max(b8, 0.0), lo1_8 = -max(-b8, 0.0);

	// Argument order expected by f0/f1: t2 hi, t2 lo, t3 hi, t3 lo.
	t0[texel] = f0(hi0_0, hi0_1, hi0_2, hi0_3, hi0_4, hi0_5, hi0_6, hi0_7, hi0_8, lo0_0, lo0_1, lo0_2, lo0_3, lo0_4, lo0_5, lo0_6, lo0_7, lo0_8, hi1_0, hi1_1, hi1_2, hi1_3, hi1_4, hi1_5, hi1_6, hi1_7, hi1_8, lo1_0, lo1_1, lo1_2, lo1_3, lo1_4, lo1_5, lo1_6, lo1_7, lo1_8);
	t1[texel] = f1(hi0_0, hi0_1, hi0_2, hi0_3, hi0_4, hi0_5, hi0_6, hi0_7, hi0_8, lo0_0, lo0_1, lo0_2, lo0_3, lo0_4, lo0_5, lo0_6, lo0_7, lo0_8, hi1_0, hi1_1, hi1_2, hi1_3, hi1_4, hi1_5, hi1_6, hi1_7, hi1_8, lo1_0, lo1_1, lo1_2, lo1_3, lo1_4, lo1_5, lo1_6, lo1_7, lo1_8);
}

//!PASS 6
//!DESC conv5
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3

// Tap helpers for this pass: l0 reads t0 and l1 reads t1 through the O() macro,
// i.e. a SampleLevel with the point sampler SP at `pos + float2(x, y) * pt`.
// `pos` and `pt` are not parameters; they must be in scope where the macro expands.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))

// conv5, first output vector: Pass6 stores the returned value to t2.
// The 36 arguments are the vectors Pass6 builds from two 3x3 neighbourhoods;
// the index k in sN_k runs row-major over the offsets (-1,-1) .. (1,1):
//   s0_k = max(t0 tap, 0)      s1_k = -max(-(t0 tap), 0)
//   s2_k = max(t1 tap, 0)      s3_k = -max(-(t1 tap), 0)
// Starting from a constant vector, every argument is folded into the
// accumulator r together with its own constant 4x4 matrix via MulAdd, in the
// fixed order s0_0 .. s3_8, and the final r is returned.
// NOTE(review): MulAdd is supplied by the framework (//!USE MulAdd); it
// presumably evaluates mul(v, m) + r - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { -8.722e-04, 4.321e-03, -2.282e-03, 5.775e-04 };
	// Terms for s0_0 .. s0_8.
	r = MulAdd(s0_0, M4(-3.543e-02, 7.950e-02, 4.966e-02, -1.433e-03, 8.905e-03, -6.500e-02, 3.529e-02, -6.468e-03, -9.021e-04, -2.890e-03, 2.941e-02, 1.722e-02, -9.993e-02, -4.554e-02, 1.116e-01, 4.605e-03), r);
	r = MulAdd(s0_1, M4(-1.089e-01, -6.759e-02, 5.507e-02, 2.826e-02, 3.102e-02, 4.981e-03, -6.957e-03, -2.970e-02, 3.893e-02, -1.985e-02, 3.482e-03, -3.115e-03, 6.911e-02, -1.665e-01, -3.388e-02, -1.252e-02), r);
	r = MulAdd(s0_2, M4(1.046e-01, 2.638e-02, -8.177e-02, -9.476e-02, -2.313e-02, 9.290e-03, -1.408e-03, 4.678e-02, -1.106e-01, 4.373e-02, 4.853e-02, 4.161e-02, 2.201e-02, -4.520e-02, 1.641e-02, 1.639e-03), r);
	r = MulAdd(s0_3, M4(-3.788e-02, -1.936e-01, 4.114e-02, 1.067e-02, 2.481e-02, 1.257e-02, -2.408e-02, 9.182e-02, -1.143e-02, -2.779e-02, -5.009e-02, -3.573e-02, -4.170e-01, -3.232e-01, 1.870e-01, 1.240e-02), r);
	r = MulAdd(s0_4, M4(-2.012e-01, 3.719e-03, 4.258e-01, 1.357e-01, 1.017e-01, -9.143e-02, 1.062e-01, -2.157e-01, 2.065e-01, -1.497e-01, 2.811e-01, -1.238e-01, 4.285e-01, -1.183e-03, -2.368e-01, 3.175e-01), r);
	r = MulAdd(s0_5, M4(2.946e-01, 5.902e-02, 3.874e-02, -9.653e-02, -9.581e-02, 2.797e-02, -5.432e-02, -3.276e-03, -2.409e-01, 4.022e-02, 6.466e-03, -1.014e-01, 8.373e-02, 1.670e-02, 4.462e-02, 8.036e-02), r);
	r = MulAdd(s0_6, M4(-8.663e-02, -2.484e-02, 3.960e-02, -4.742e-03, 5.179e-02, 8.916e-02, -2.668e-02, -2.848e-02, 9.887e-02, 1.283e-02, -2.283e-02, -8.960e-02, -1.223e-01, -1.672e-02, 6.586e-02, 2.669e-02), r);
	r = MulAdd(s0_7, M4(-2.569e-01, 1.932e-03, 1.442e-01, -1.947e-04, 1.002e-01, 9.362e-02, 1.206e-01, 5.726e-02, -3.269e-02, 1.274e-01, 7.910e-02, -1.930e-01, 9.277e-02, 2.994e-02, -4.800e-02, 8.016e-02), r);
	r = MulAdd(s0_8, M4(9.842e-02, 1.108e-01, -1.303e-01, -3.840e-01, -4.456e-02, -6.372e-02, -6.192e-02, 4.026e-02, -8.011e-02, -7.656e-02, -2.155e-02, 1.167e-02, 4.969e-02, -8.967e-03, -1.639e-02, 6.565e-02), r);
	// Terms for s1_0 .. s1_8.
	r = MulAdd(s1_0, M4(-3.505e-03, 5.798e-02, -1.063e-02, 1.098e-02, -5.275e-02, -1.071e-02, 9.760e-02, -4.337e-02, 2.637e-02, -4.918e-02, 1.090e-03, -1.405e-02, -6.812e-02, -5.049e-02, 1.111e-01, -3.104e-02), r);
	r = MulAdd(s1_1, M4(-6.327e-02, 8.466e-03, 1.248e-01, 2.211e-03, 7.483e-03, 6.452e-02, -1.087e-01, 3.269e-02, 2.075e-02, -1.408e-01, 3.009e-02, -4.146e-02, 1.034e-01, 1.109e-01, -9.252e-02, -1.611e-02), r);
	r = MulAdd(s1_2, M4(7.447e-02, 4.614e-02, -6.169e-02, -2.029e-02, -7.331e-02, -3.250e-02, 7.219e-02, 1.042e-02, -2.912e-02, 4.549e-02, -1.047e-02, 2.094e-03, -5.769e-02, -1.615e-02, 8.071e-03, 3.771e-02), r);
	r = MulAdd(s1_3, M4(-4.766e-02, -1.712e-01, 8.735e-02, -5.360e-02, -1.975e-01, -2.133e-01, 8.035e-02, 5.819e-02, -7.093e-02, -2.900e-01, -3.236e-02, -2.934e-02, -3.232e-01, -1.972e-01, 2.268e-01, 1.010e-01), r);
	r = MulAdd(s1_4, M4(-2.210e-01, -1.157e-01, 2.817e-01, 5.840e-02, 2.098e-01, -8.874e-02, 7.924e-02, -2.510e-01, 1.030e-01, 1.279e-01, 6.653e-02, 1.219e-01, 4.141e-01, 2.998e-01, -9.834e-02, 1.607e-01), r);
	r = MulAdd(s1_5, M4(1.294e-01, -1.988e-02, -1.361e-01, 2.103e-01, -1.155e-01, -2.235e-02, 6.869e-02, -8.075e-02, -4.372e-03, 5.632e-02, -7.754e-02, 7.795e-02, -1.595e-01, -1.241e-01, -2.547e-03, 9.130e-02), r);
	r = MulAdd(s1_6, M4(-4.901e-02, -4.760e-02, 1.173e-02, -6.623e-02, -2.803e-01, -1.136e-01, 2.054e-01, -7.351e-03, -1.322e-01, -2.660e-01, 3.918e-02, -8.464e-02, -1.010e-01, 3.265e-02, 6.085e-02, 7.092e-02), r);
	r = MulAdd(s1_7, M4(-1.737e-01, 5.872e-02, 1.029e-01, -1.338e-01, 1.971e-02, -1.182e-03, 6.404e-02, 1.520e-01, 2.388e-01, -3.107e-02, -1.075e-01, -1.224e-01, 1.625e-01, -7.233e-03, -6.741e-02, 2.098e-01), r);
	r = MulAdd(s1_8, M4(1.496e-01, 7.058e-02, -7.112e-02, -7.923e-03, -8.993e-02, -9.366e-02, -1.300e-02, 1.920e-01, 1.403e-01, 1.831e-01, 1.431e-01, -2.287e-01, 2.402e-02, 5.445e-03, 2.004e-02, 4.421e-02), r);
	// Terms for s2_0 .. s2_8.
	r = MulAdd(s2_0, M4(2.048e-02, -2.281e-01, -6.323e-03, -3.437e-02, -4.873e-03, -1.313e-01, 9.471e-02, -1.031e-01, 8.325e-02, 9.654e-02, -6.140e-02, 3.037e-02, -1.616e-01, -3.013e-02, 1.615e-01, 6.894e-03), r);
	r = MulAdd(s2_1, M4(-1.950e-01, 3.048e-01, -1.785e-01, 1.902e-01, 9.630e-02, 2.068e-01, -1.684e-01, -1.342e-01, 4.692e-02, 6.126e-02, -7.899e-02, 9.330e-02, 3.255e-02, 3.098e-01, -2.201e-01, 4.671e-02), r);
	r = MulAdd(s2_2, M4(1.306e-01, 5.318e-02, -5.240e-02, 2.361e-02, 2.423e-02, 2.361e-03, -1.695e-02, 1.569e-02, -7.172e-02, -3.968e-02, 1.209e-01, 7.186e-02, 5.879e-02, 1.591e-02, -1.891e-02, -2.890e-02), r);
	r = MulAdd(s2_3, M4(2.569e-01, 4.588e-01, -2.766e-01, 7.439e-02, -2.793e-01, -1.519e-01, 7.798e-02, 2.029e-02, 2.249e-01, 2.073e-01, -2.353e-01, 3.969e-02, -3.564e-01, -1.221e-01, 2.605e-01, -1.442e-02), r);
	r = MulAdd(s2_4, M4(-5.101e-01, -1.367e-01, 1.769e-02, -8.440e-01, 5.105e-02, 1.282e-01, 7.769e-02, 9.786e-02, -2.935e-02, 8.851e-02, 6.132e-01, 3.093e-01, -1.629e-01, -7.917e-02, -3.350e-01, 1.921e-01), r);
	r = MulAdd(s2_5, M4(1.598e-01, 7.845e-02, 1.064e-01, 2.970e-03, 3.521e-02, 5.990e-02, 7.355e-03, 2.954e-02, -1.850e-01, 2.212e-01, -3.574e-02, -5.264e-01, 2.390e-01, 2.798e-01, 7.917e-03, -4.841e-02), r);
	r = MulAdd(s2_6, M4(1.539e-01, 6.461e-02, -2.001e-02, 8.195e-03, -1.185e-01, -7.702e-02, 4.283e-02, -1.233e-01, 1.809e-01, 1.459e-02, -5.636e-02, -1.972e-02, -9.290e-02, 1.186e-01, 3.799e-02, 6.921e-02), r);
	r = MulAdd(s2_7, M4(-1.350e-01, -1.079e-01, 3.874e-02, -2.217e-02, 7.493e-02, 1.194e-01, 3.591e-02, -1.228e-01, -4.547e-02, -2.423e-02, 6.285e-02, -9.256e-02, 6.649e-02, 6.761e-03, -1.018e-01, 1.880e-01), r);
	r = MulAdd(s2_8, M4(1.154e-02, 5.337e-03, -8.154e-03, -4.658e-02, 5.640e-03, -1.535e-02, 3.469e-02, -3.478e-02, -1.819e-01, -1.338e-01, 2.321e-02, 2.736e-01, 1.002e-01, 5.359e-02, 5.718e-02, -6.956e-03), r);
	// Terms for s3_0 .. s3_8.
	r = MulAdd(s3_0, M4(6.467e-02, -1.265e-01, -5.698e-03, -7.443e-03, -1.703e-02, 4.604e-02, 2.747e-02, -1.545e-02, 1.014e-01, 2.919e-02, -5.269e-02, 4.818e-02, -8.014e-02, 5.749e-02, 3.776e-02, 7.501e-03), r);
	r = MulAdd(s3_1, M4(-1.627e-01, -3.031e-02, 3.391e-02, 4.149e-05, 6.013e-02, 8.038e-02, 1.545e-02, 1.997e-02, -1.785e-02, -3.410e-01, 1.219e-01, 1.513e-02, -7.337e-02, -2.446e-02, -7.831e-03, 2.064e-02), r);
	r = MulAdd(s3_2, M4(5.945e-02, -7.145e-02, -4.252e-03, -1.227e-02, -4.884e-02, -2.366e-02, 4.214e-04, 4.615e-02, -3.871e-02, 3.228e-02, -7.452e-03, 3.501e-02, 9.101e-02, 3.086e-02, -8.161e-02, 4.753e-02), r);
	r = MulAdd(s3_3, M4(1.265e-01, 5.028e-02, -2.140e-02, 6.421e-02, -4.819e-03, -1.641e-01, -3.700e-02, -6.100e-02, 1.873e-01, 2.147e-01, -2.009e-01, 1.324e-03, -7.643e-02, -1.167e-01, 1.724e-01, 2.491e-02), r);
	r = MulAdd(s3_4, M4(-4.396e-01, -8.640e-02, -1.586e-01, -1.634e-01, 2.135e-01, 2.594e-01, 3.525e-01, -4.384e-01, 1.224e-01, 1.559e-02, 9.960e-02, -4.443e-02, -2.066e-01, 5.257e-02, -1.255e-01, 1.136e-01), r);
	r = MulAdd(s3_5, M4(1.476e-01, 1.642e-02, 8.563e-02, -2.731e-02, -9.205e-02, -8.172e-03, -9.883e-03, 4.732e-02, -1.877e-01, -4.119e-02, -3.235e-02, 1.874e-02, 1.632e-01, 3.157e-02, -7.976e-02, 6.713e-02), r);
	r = MulAdd(s3_6, M4(5.719e-02, 2.829e-03, -3.028e-02, -3.731e-02, -1.195e-01, -2.658e-02, 7.224e-02, -6.579e-03, 7.836e-02, -5.193e-02, -2.216e-02, -3.423e-02, -7.006e-02, 1.208e-02, 9.662e-04, 3.652e-03), r);
	r = MulAdd(s3_7, M4(-1.694e-01, 5.317e-02, 1.652e-01, -4.481e-02, 7.641e-04, 2.080e-02, 1.120e-02, 3.510e-02, 7.017e-03, -1.044e-01, 4.685e-03, 4.015e-02, 1.265e-01, 1.437e-01, -9.223e-02, -6.538e-02), r);
	r = MulAdd(s3_8, M4(2.532e-02, -4.911e-02, -7.204e-02, -3.564e-02, 9.314e-03, 5.000e-02, 8.495e-02, -5.174e-02, -1.079e-01, -5.239e-02, -3.180e-02, 5.520e-02, 2.376e-02, 3.436e-02, 7.244e-02, -9.291e-02), r);
	return r;
}

// conv5, second output vector: Pass6 stores the returned value to t3.
// Takes the same 36 arguments as this pass's f0 (sN_k, with k running
// row-major over the 3x3 offsets; s0/s1 come from t0 taps, s2/s3 from t1
// taps) but uses its own starting vector and its own 4x4 matrices.
// Each argument is folded into the accumulator r via MulAdd in the fixed
// order s0_0 .. s3_8, and the final r is returned.
// NOTE(review): MulAdd is supplied by the framework (//!USE MulAdd); it
// presumably evaluates mul(v, m) + r - confirm against its definition.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Constant starting value of the accumulator.
	V4 r = { 2.664e-03, 8.117e-03, -7.210e-03, -3.509e-03 };
	// Terms for s0_0 .. s0_8.
	r = MulAdd(s0_0, M4(7.753e-02, -5.337e-02, -2.432e-02, 3.644e-02, 1.594e-03, 2.936e-02, -2.457e-02, 9.446e-03, -1.292e-03, 1.233e-02, -3.333e-02, -1.896e-02, 2.404e-02, -8.731e-04, -1.313e-02, -5.993e-02), r);
	r = MulAdd(s0_1, M4(-1.016e-03, 1.421e-01, -1.392e-01, -8.620e-02, 2.236e-02, -5.276e-02, -9.275e-03, 1.497e-01, 5.114e-02, 1.802e-01, 2.670e-02, 1.683e-02, -8.748e-04, -1.293e-01, 2.992e-01, 2.780e-01), r);
	r = MulAdd(s0_2, M4(-7.056e-02, 2.695e-02, -1.473e-01, 3.070e-02, -3.678e-02, 5.208e-02, 2.452e-03, -5.092e-02, -8.107e-02, 1.028e-01, -9.323e-02, -1.948e-01, -1.106e-01, -1.169e-01, 6.257e-02, 4.575e-02), r);
	r = MulAdd(s0_3, M4(-1.961e-01, 7.539e-02, -8.065e-02, -6.256e-02, -1.814e-01, 8.188e-02, -1.757e-02, -9.656e-02, -4.720e-02, -3.039e-02, 1.786e-03, -2.460e-02, 6.616e-02, 1.227e-02, -1.082e-01, 2.060e-01), r);
	r = MulAdd(s0_4, M4(1.100e-02, 1.185e-01, -3.507e-01, 1.974e-01, 8.355e-03, -1.227e-01, 3.761e-01, 3.390e-01, 8.516e-04, -1.056e-01, 2.269e-01, -2.299e-01, -3.527e-01, -3.526e-01, -7.157e-02, 6.354e-02), r);
	r = MulAdd(s0_5, M4(-9.008e-02, 4.770e-01, -9.990e-02, 9.218e-02, 7.512e-02, -1.304e-01, 9.839e-02, 9.985e-03, 1.453e-01, -3.162e-02, -4.388e-02, 7.887e-02, 3.180e-03, -9.248e-02, 1.566e-02, 5.009e-03), r);
	r = MulAdd(s0_6, M4(3.981e-02, -3.872e-02, 6.997e-02, 2.283e-02, -3.285e-02, 5.390e-02, -2.957e-02, -8.100e-02, 1.132e-02, 6.093e-02, -5.962e-02, 2.166e-02, -3.606e-02, -1.190e-03, 4.312e-03, 4.615e-02), r);
	r = MulAdd(s0_7, M4(-2.725e-01, 3.443e-02, 9.929e-02, -9.112e-03, 4.324e-02, 1.421e-01, 2.795e-02, 1.846e-03, 1.279e-01, -1.918e-02, -4.382e-02, 1.812e-01, -4.804e-02, 9.204e-04, -8.716e-02, 4.059e-02), r);
	r = MulAdd(s0_8, M4(4.227e-02, -1.239e-01, -6.485e-02, 8.795e-02, 7.347e-02, 1.209e-01, 3.079e-03, -5.941e-02, -1.296e-01, 1.148e-01, -5.571e-02, -1.353e-01, -7.367e-02, -1.618e-02, 6.989e-03, -2.546e-02), r);
	// Terms for s1_0 .. s1_8.
	r = MulAdd(s1_0, M4(-6.146e-02, -4.772e-02, -9.417e-03, -4.165e-02, 2.001e-01, 4.540e-02, -4.097e-02, 3.659e-02, 2.014e-02, -6.690e-02, 4.933e-02, 8.231e-02, -9.543e-03, -3.356e-02, -2.012e-02, -5.326e-02), r);
	r = MulAdd(s1_1, M4(1.254e-01, 1.032e-01, -7.584e-02, -1.386e-01, -1.665e-01, -4.887e-03, 3.620e-02, 6.679e-02, 3.288e-02, 3.521e-03, 4.558e-02, 1.208e-01, 5.012e-02, 4.727e-02, -2.972e-02, 1.239e-01), r);
	r = MulAdd(s1_2, M4(1.979e-03, -9.131e-03, 1.800e-02, 3.867e-02, 2.082e-02, -2.502e-02, -9.056e-02, -9.712e-02, -8.861e-02, 4.792e-02, -2.188e-02, -5.227e-02, 4.840e-02, 7.105e-02, -7.533e-03, -1.297e-01), r);
	r = MulAdd(s1_3, M4(1.216e-01, 8.527e-02, -2.377e-02, 4.553e-02, -3.099e-01, -8.314e-02, 9.121e-02, -1.404e-01, 3.008e-02, 2.271e-02, 2.118e-02, 1.255e-01, 6.427e-02, 1.129e-02, -8.356e-02, 1.902e-01), r);
	r = MulAdd(s1_4, M4(-5.332e-01, 2.151e-01, -2.231e-01, -6.729e-02, 3.100e-01, -5.922e-01, 4.658e-01, 4.307e-01, -2.838e-01, 1.628e-01, -2.627e-02, -3.515e-01, -3.412e-01, -8.481e-01, -1.145e-01, 4.690e-01), r);
	r = MulAdd(s1_5, M4(-4.161e-01, 2.098e-02, -1.141e-01, -1.561e-02, -5.537e-02, -3.509e-01, 1.350e-01, 2.163e-01, 2.016e-01, -1.842e-01, -4.780e-02, 2.205e-01, -8.208e-02, -1.956e-01, -4.500e-02, -2.913e-01), r);
	r = MulAdd(s1_6, M4(-5.157e-03, -9.563e-03, 3.716e-02, 2.411e-02, 2.625e-01, 2.386e-02, 5.859e-02, 1.699e-01, 1.736e-01, -4.062e-02, 1.283e-01, 2.319e-01, 4.247e-02, 6.236e-02, -3.562e-02, -9.159e-02), r);
	r = MulAdd(s1_7, M4(-1.881e-01, -8.441e-02, -4.476e-03, -2.962e-03, -1.410e-01, -1.377e-01, -1.603e-01, 2.566e-01, -1.171e-01, -2.909e-01, -7.991e-02, 3.955e-03, -1.577e-01, 8.067e-02, 3.065e-02, -7.451e-02), r);
	r = MulAdd(s1_8, M4(3.017e-02, 5.592e-02, -1.845e-02, 8.121e-02, -1.605e-01, -1.450e-02, 6.103e-02, 5.926e-02, 1.401e-02, 7.801e-02, -1.884e-01, 1.805e-01, 1.250e-01, -4.373e-03, -2.044e-02, 7.077e-03), r);
	// Terms for s2_0 .. s2_8.
	r = MulAdd(s2_0, M4(-2.586e-02, -5.334e-02, -4.608e-02, 1.800e-01, 5.117e-02, 7.026e-02, -1.504e-01, 1.845e-01, 5.936e-02, -1.167e-01, 1.007e-01, 1.775e-02, 9.509e-02, 1.031e-01, -4.267e-02, -4.285e-02), r);
	r = MulAdd(s2_1, M4(-2.268e-01, -3.782e-01, -1.416e-01, 1.976e-02, 7.927e-02, -3.233e-01, 4.022e-02, 1.397e-01, -1.285e-01, -2.301e-02, 2.178e-01, -3.169e-01, -1.043e-01, 1.333e-01, 6.284e-02, -2.571e-01), r);
	r = MulAdd(s2_2, M4(1.330e-01, -2.228e-01, -2.477e-02, 1.212e-01, 1.048e-01, -3.967e-02, 6.519e-02, -3.451e-02, 2.996e-03, -8.184e-02, -3.909e-02, -4.447e-02, 1.670e-01, -4.006e-02, 5.001e-02, 7.491e-03), r);
	r = MulAdd(s2_3, M4(-1.906e-01, 2.986e-02, -3.952e-02, -2.861e-01, 1.208e-01, 2.352e-01, 2.821e-02, -4.515e-02, 2.386e-02, -1.363e-01, 5.457e-02, 1.751e-02, 2.567e-01, 4.067e-02, -3.435e-02, 1.811e-01), r);
	r = MulAdd(s2_4, M4(5.372e-01, 4.567e-01, 1.075e-02, -4.759e-02, 5.753e-02, 2.359e-01, -1.344e-01, -3.942e-01, 1.139e-01, 2.329e-01, 7.097e-03, -2.416e-01, -3.146e-01, -5.996e-01, -8.141e-05, 2.811e-01), r);
	r = MulAdd(s2_5, M4(1.261e-02, 1.636e-01, -1.048e-01, 1.364e-02, 3.740e-03, 5.542e-02, -5.258e-02, -1.429e-01, 2.881e-01, -4.462e-01, 1.270e-02, -7.021e-03, -4.951e-02, -2.034e-03, 4.602e-02, 1.214e-01), r);
	r = MulAdd(s2_6, M4(-6.033e-02, 1.222e-02, 3.018e-03, -1.194e-01, 1.189e-01, -4.994e-02, 6.138e-02, 6.122e-02, -2.687e-02, 2.007e-02, -7.344e-04, -3.446e-02, 6.520e-02, -1.479e-02, -3.500e-02, -2.706e-02), r);
	r = MulAdd(s2_7, M4(1.186e-01, -6.526e-02, 1.112e-01, 3.259e-02, 4.759e-02, -1.519e-01, 6.760e-02, 2.415e-02, -2.772e-02, -9.989e-02, 3.411e-02, -2.095e-01, 8.407e-02, 2.159e-01, -4.676e-02, -1.920e-02), r);
	r = MulAdd(s2_8, M4(8.961e-02, 5.177e-03, 2.837e-02, 8.839e-02, 5.019e-03, -2.271e-02, -2.850e-02, -6.803e-02, 8.845e-02, 1.518e-01, 1.899e-01, -3.774e-02, 3.282e-02, -5.420e-03, -3.044e-02, -8.935e-02), r);
	// Terms for s3_0 .. s3_8.
	r = MulAdd(s3_0, M4(-6.317e-02, -8.128e-02, 2.424e-02, 7.323e-02, 8.362e-02, 1.499e-01, -9.427e-02, -4.817e-02, -5.095e-02, -4.980e-02, 1.289e-02, 3.431e-02, 1.232e-01, 9.105e-02, -2.362e-03, -6.421e-02), r);
	r = MulAdd(s3_1, M4(-3.092e-02, -2.956e-01, 1.516e-01, 4.510e-02, 9.056e-04, 8.930e-02, -6.518e-02, -9.671e-03, -3.079e-02, -1.072e-01, 1.636e-01, 8.242e-02, -1.800e-02, 9.257e-02, -2.550e-02, -7.577e-02), r);
	r = MulAdd(s3_2, M4(2.229e-02, -1.460e-01, -6.113e-03, 1.165e-01, 2.291e-02, -2.013e-02, 5.773e-02, -8.558e-02, -1.616e-01, -2.072e-01, 1.440e-01, 6.408e-02, -4.857e-02, 2.835e-02, 1.128e-01, 1.248e-02), r);
	r = MulAdd(s3_3, M4(-7.688e-02, -2.559e-02, 5.337e-02, -1.297e-01, -1.065e-01, 3.964e-03, 2.806e-02, -8.071e-03, -7.883e-02, -1.012e-01, 6.138e-02, -1.165e-01, -3.865e-02, 8.460e-02, -8.840e-02, 1.374e-02), r);
	r = MulAdd(s3_4, M4(-6.546e-02, 3.511e-01, 2.069e-02, 5.183e-01, 3.118e-01, 4.113e-02, -2.531e-01, 1.565e-01, -7.322e-03, 1.533e-01, 7.628e-02, -2.852e-01, 8.712e-02, -2.433e-01, 7.957e-02, 4.118e-01), r);
	r = MulAdd(s3_5, M4(-1.812e-01, -3.087e-02, -4.874e-02, -4.923e-02, -9.320e-03, -7.889e-02, -8.610e-02, -4.763e-02, 8.360e-02, -2.848e-02, -1.165e-02, -6.383e-02, -9.233e-02, 1.244e-01, -4.966e-02, -2.979e-02), r);
	r = MulAdd(s3_6, M4(-2.472e-02, 4.788e-02, -2.114e-02, -1.386e-02, -7.752e-02, 6.096e-02, 3.628e-02, -6.224e-02, 6.418e-02, -5.069e-02, 1.566e-02, 5.723e-02, -1.027e-01, 7.279e-02, -7.078e-02, -3.996e-02), r);
	r = MulAdd(s3_7, M4(1.666e-02, -1.710e-03, 6.469e-04, 6.369e-03, 2.621e-02, 2.647e-02, 1.027e-01, 7.724e-02, -1.024e-01, -8.304e-02, 1.842e-02, -5.779e-02, 1.711e-01, 8.228e-02, -1.305e-01, 8.329e-02), r);
	r = MulAdd(s3_8, M4(-2.309e-02, 2.703e-02, -2.298e-02, -7.029e-03, 6.323e-02, 6.507e-02, -4.531e-02, -1.340e-02, -7.977e-02, -4.942e-02, 4.605e-02, 9.001e-02, 8.840e-02, 8.368e-02, -5.912e-02, -1.229e-01), r);
	return r;
}

// Pass 6 entry point. Each thread handles one pixel: it gathers a 3x3
// neighbourhood through each of the pass-local l0 / l1 fetch macros, splits
// every tap into a non-negative and a non-positive part, and stores f0 / f1 of
// the resulting 36 vectors into t2 / t3.
void Pass6(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names on purpose: the O() sampling macro
	// expands to `pos + p * pt` and picks both up from this scope.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	// Threads beyond the size reported by GetInputSize() have nothing to write.
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Source l0. Taps are numbered row by row from (-1,-1) to (1,1).
	// posA* = max(tap, 0), negA* = -max(-tap, 0).
	const V4 tapA0 = l0(-1.0, -1.0);
	const V4 posA0 = max(tapA0, 0.0);
	const V4 negA0 = -max(-tapA0, 0.0);
	const V4 tapA1 = l0(0.0, -1.0);
	const V4 posA1 = max(tapA1, 0.0);
	const V4 negA1 = -max(-tapA1, 0.0);
	const V4 tapA2 = l0(1.0, -1.0);
	const V4 posA2 = max(tapA2, 0.0);
	const V4 negA2 = -max(-tapA2, 0.0);
	const V4 tapA3 = l0(-1.0, 0.0);
	const V4 posA3 = max(tapA3, 0.0);
	const V4 negA3 = -max(-tapA3, 0.0);
	const V4 tapA4 = l0(0.0, 0.0);
	const V4 posA4 = max(tapA4, 0.0);
	const V4 negA4 = -max(-tapA4, 0.0);
	const V4 tapA5 = l0(1.0, 0.0);
	const V4 posA5 = max(tapA5, 0.0);
	const V4 negA5 = -max(-tapA5, 0.0);
	const V4 tapA6 = l0(-1.0, 1.0);
	const V4 posA6 = max(tapA6, 0.0);
	const V4 negA6 = -max(-tapA6, 0.0);
	const V4 tapA7 = l0(0.0, 1.0);
	const V4 posA7 = max(tapA7, 0.0);
	const V4 negA7 = -max(-tapA7, 0.0);
	const V4 tapA8 = l0(1.0, 1.0);
	const V4 posA8 = max(tapA8, 0.0);
	const V4 negA8 = -max(-tapA8, 0.0);

	// Source l1, same tap order and the same positive / negative split.
	const V4 tapB0 = l1(-1.0, -1.0);
	const V4 posB0 = max(tapB0, 0.0);
	const V4 negB0 = -max(-tapB0, 0.0);
	const V4 tapB1 = l1(0.0, -1.0);
	const V4 posB1 = max(tapB1, 0.0);
	const V4 negB1 = -max(-tapB1, 0.0);
	const V4 tapB2 = l1(1.0, -1.0);
	const V4 posB2 = max(tapB2, 0.0);
	const V4 negB2 = -max(-tapB2, 0.0);
	const V4 tapB3 = l1(-1.0, 0.0);
	const V4 posB3 = max(tapB3, 0.0);
	const V4 negB3 = -max(-tapB3, 0.0);
	const V4 tapB4 = l1(0.0, 0.0);
	const V4 posB4 = max(tapB4, 0.0);
	const V4 negB4 = -max(-tapB4, 0.0);
	const V4 tapB5 = l1(1.0, 0.0);
	const V4 posB5 = max(tapB5, 0.0);
	const V4 negB5 = -max(-tapB5, 0.0);
	const V4 tapB6 = l1(-1.0, 1.0);
	const V4 posB6 = max(tapB6, 0.0);
	const V4 negB6 = -max(-tapB6, 0.0);
	const V4 tapB7 = l1(0.0, 1.0);
	const V4 posB7 = max(tapB7, 0.0);
	const V4 negB7 = -max(-tapB7, 0.0);
	const V4 tapB8 = l1(1.0, 1.0);
	const V4 posB8 = max(tapB8, 0.0);
	const V4 negB8 = -max(-tapB8, 0.0);

	// Argument order expected by f0 / f1: A positive, A negative, B positive, B negative.
	t2[gxy] = f0(
		posA0, posA1, posA2, posA3, posA4, posA5, posA6, posA7, posA8,
		negA0, negA1, negA2, negA3, negA4, negA5, negA6, negA7, negA8,
		posB0, posB1, posB2, posB3, posB4, posB5, posB6, posB7, posB8,
		negB0, negB1, negB2, negB3, negB4, negB5, negB6, negB7, negB8);
	t3[gxy] = f1(
		posA0, posA1, posA2, posA3, posA4, posA5, posA6, posA7, posA8,
		negA0, negA1, negA2, negA3, negA4, negA5, negA6, negA7, negA8,
		posB0, posB1, posB2, posB3, posB4, posB5, posB6, posB7, posB8,
		negB0, negB1, negB2, negB3, negB4, negB5, negB6, negB7, negB8);
}

//!PASS 7
//!DESC conv6
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1

// Pass-local tap fetchers: read t2 (l0) or t3 (l1) at `pos` displaced by
// (x, y) steps of `pt`, using the point sampler SP via the O() macro.
// Both expand to code that refers to locals named `pos` and `pt`, so those
// must exist under exactly these names wherever l0 / l1 are used.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))

// Pass 7 (conv6), first output vector: starts from a constant 4-component bias
// and accumulates one MulAdd per input vector, each with its own constant 4x4
// weight matrix. Pass7 stores the result in t0.
//
// Inputs, as supplied by Pass7:
//   s0_k = max(tap, 0)   and s1_k = -max(-tap, 0) for the l0 taps,
//   s2_k = max(tap, 0)   and s3_k = -max(-tap, 0) for the l1 taps,
//   k = 0..8 walks the 3x3 offsets row by row from (-1,-1) to (1,1).
//
// NOTE(review): MulAdd is pulled in by the `//!USE MulAdd` directive; it
// presumably evaluates mul(x, M) + r - confirm against the framework helper.
// The constants are generated weights; do not reorder the accumulation chain.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Bias term.
	V4 r = { 1.496e-03, 1.976e-03, -2.228e-03, 2.339e-03 };
	// Positive parts of the l0 taps.
	r = MulAdd(s0_0, M4(-7.726e-02, -4.236e-02, 2.663e-02, 7.717e-02, 1.578e-02, -3.045e-02, 1.833e-03, 4.928e-03, 2.055e-02, 1.234e-03, 1.293e-02, 8.221e-03, -2.347e-02, 4.309e-02, 7.447e-02, 4.002e-03), r);
	r = MulAdd(s0_1, M4(2.106e-02, 1.496e-01, 2.554e-02, 9.246e-02, -2.261e-01, 9.247e-02, 2.955e-02, 3.709e-02, -9.295e-02, 1.226e-02, -2.886e-02, -6.831e-04, -1.196e-01, 2.725e-01, -2.178e-01, -1.039e-01), r);
	r = MulAdd(s0_2, M4(3.396e-02, -8.075e-03, -6.267e-02, -4.931e-02, 1.142e-02, -4.036e-02, 4.313e-02, 9.018e-02, 2.037e-02, 2.323e-03, 5.455e-02, 7.263e-02, 6.421e-02, -3.649e-02, 3.168e-01, 8.844e-02), r);
	r = MulAdd(s0_3, M4(-5.285e-03, -4.309e-02, -7.397e-02, -5.091e-02, 1.711e-02, 9.498e-02, 1.229e-02, 3.768e-02, 3.743e-03, 2.677e-02, -1.260e-02, 3.233e-02, -1.630e-01, -2.482e-02, -3.347e-03, -6.632e-03), r);
	r = MulAdd(s0_4, M4(-1.568e-01, -9.216e-02, -1.668e-02, -4.316e-03, -1.176e-02, 3.826e-02, -5.394e-02, 6.991e-02, 1.248e-01, 2.796e-02, -1.266e-01, -2.623e-02, -1.338e-01, -4.132e-01, 4.030e-02, 5.239e-02), r);
	r = MulAdd(s0_5, M4(-8.780e-02, 5.905e-02, 3.941e-03, 7.738e-02, 1.064e-01, 1.930e-02, -9.021e-03, 3.723e-02, -1.666e-01, -1.172e-03, -3.216e-01, 6.315e-02, -5.310e-02, -3.348e-02, -7.481e-02, 4.842e-02), r);
	r = MulAdd(s0_6, M4(-1.376e-02, 2.783e-02, 1.790e-02, 3.019e-02, -1.654e-02, 5.290e-03, -2.644e-02, 9.440e-03, -1.162e-02, -1.778e-02, -9.626e-03, -1.727e-02, 6.551e-03, 3.601e-02, -7.306e-03, 9.867e-03), r);
	r = MulAdd(s0_7, M4(7.459e-03, -1.180e-01, -2.086e-02, 7.975e-03, 3.646e-02, 1.958e-01, 3.334e-02, 1.391e-02, -1.209e-01, 2.599e-03, 7.582e-02, 2.697e-03, 3.305e-02, 3.455e-02, -5.884e-02, 2.060e-02), r);
	r = MulAdd(s0_8, M4(6.161e-02, -2.655e-02, 3.691e-02, -2.139e-02, -9.580e-02, -1.344e-02, -8.940e-04, 1.652e-02, 4.274e-02, 4.742e-02, 1.043e-01, 6.114e-03, -2.154e-02, 3.635e-03, -7.209e-02, 2.929e-03), r);
	// Negative parts of the l0 taps.
	r = MulAdd(s1_0, M4(-5.330e-02, 2.271e-02, -2.661e-02, 5.278e-02, 1.092e-02, -1.082e-02, 4.099e-04, 4.704e-02, -1.445e-02, -1.413e-02, 3.703e-02, -4.442e-02, -6.642e-02, 4.103e-03, 5.965e-02, 4.796e-03), r);
	r = MulAdd(s1_1, M4(-9.926e-04, 2.275e-01, 5.207e-03, 4.746e-02, -2.941e-01, -5.993e-02, 1.567e-02, 1.336e-01, -1.455e-01, 1.303e-01, -3.239e-02, -1.908e-03, 5.867e-02, 8.483e-02, -1.097e-01, -1.861e-01), r);
	r = MulAdd(s1_2, M4(1.195e-02, 1.569e-02, 1.077e-01, -1.050e-03, 1.581e-01, 3.738e-03, -6.575e-02, -1.782e-01, -5.437e-04, -3.055e-04, 1.013e-01, 1.508e-01, 9.356e-03, -1.633e-02, 1.667e-01, 8.674e-02), r);
	r = MulAdd(s1_3, M4(-1.561e-02, 5.396e-02, -1.401e-02, -3.718e-02, 8.327e-02, 8.406e-02, 3.457e-02, -3.943e-02, 5.426e-02, 4.467e-02, -1.367e-02, -1.917e-02, -1.209e-01, -6.920e-02, -7.704e-02, -9.114e-03), r);
	r = MulAdd(s1_4, M4(-4.008e-02, -7.207e-01, -7.651e-03, -2.095e-01, -5.950e-03, 1.458e-02, -8.664e-02, 2.716e-01, 3.749e-01, -2.539e-01, -1.156e-01, -5.679e-02, 3.828e-02, -2.838e-01, -7.992e-02, 2.495e-01), r);
	r = MulAdd(s1_5, M4(1.640e-02, 1.746e-01, 5.193e-02, -8.520e-02, -1.726e-01, 2.218e-02, 4.205e-01, 1.427e-01, 1.426e-02, -1.424e-02, -4.810e-02, -7.034e-02, -4.420e-02, 4.693e-02, -1.059e-01, 6.651e-02), r);
	r = MulAdd(s1_6, M4(1.277e-02, 1.624e-02, 2.739e-02, 5.491e-03, -1.327e-01, -6.239e-02, -6.986e-02, 4.944e-02, -8.849e-02, 1.346e-02, -2.343e-02, 8.993e-03, 1.826e-02, 4.062e-02, 2.256e-02, -2.188e-02), r);
	r = MulAdd(s1_7, M4(-3.195e-02, 5.322e-03, 2.795e-02, 4.747e-02, 5.449e-01, 8.519e-02, 2.450e-01, -1.700e-01, -8.635e-02, -1.283e-01, -3.413e-03, 3.388e-02, 1.035e-03, 3.489e-03, -4.357e-02, -8.121e-03), r);
	r = MulAdd(s1_8, M4(8.044e-02, -4.392e-02, 2.967e-02, 1.582e-02, -2.407e-01, -5.252e-02, -1.646e-01, 1.136e-01, 5.603e-03, -1.985e-02, 2.054e-02, 7.259e-03, -1.430e-02, 4.313e-02, -1.538e-03, -1.881e-02), r);
	// Positive parts of the l1 taps.
	r = MulAdd(s2_0, M4(2.887e-02, -3.079e-02, -2.064e-02, -1.707e-02, 1.619e-03, 9.501e-03, -3.989e-02, 6.850e-02, -1.874e-02, 4.451e-02, 1.375e-02, 3.694e-02, -5.088e-02, 2.351e-03, 1.058e-02, 1.304e-01), r);
	r = MulAdd(s2_1, M4(3.288e-02, 3.849e-02, -1.623e-02, 1.541e-02, -4.641e-02, -5.803e-02, -5.059e-02, 7.932e-02, 3.378e-02, 4.757e-02, 1.197e-02, -1.701e-01, -2.045e-01, -1.753e-01, -6.869e-03, 4.030e-01), r);
	r = MulAdd(s2_2, M4(1.061e-02, -1.051e-02, 1.275e-02, -7.823e-03, -2.037e-03, 1.164e-02, -3.866e-03, -1.300e-02, 3.747e-02, -1.642e-02, 2.196e-02, 2.024e-02, 5.399e-02, 1.077e-02, -8.838e-02, -3.091e-02), r);
	r = MulAdd(s2_3, M4(-3.355e-03, -6.519e-02, 9.655e-03, -2.412e-02, -2.784e-01, 3.036e-01, -6.591e-02, 5.922e-02, 5.079e-02, 2.057e-01, -4.058e-02, -1.246e-01, 8.695e-02, 2.195e-01, 1.155e-02, -3.856e-02), r);
	r = MulAdd(s2_4, M4(-1.359e-01, -6.361e-02, -2.067e-02, 6.341e-02, -4.197e-01, 2.943e-01, -6.616e-02, 2.731e-01, -9.448e-02, -2.046e-01, -1.120e-01, -4.872e-01, 5.110e-01, 5.573e-02, -2.966e-02, -2.035e-01), r);
	r = MulAdd(s2_5, M4(-2.759e-02, 1.413e-02, -3.434e-02, 1.415e-02, 4.479e-02, -3.119e-02, -3.333e-02, 6.124e-02, -6.661e-03, 6.655e-02, 1.934e-03, 9.448e-04, 2.794e-02, -2.752e-02, 2.138e-01, -8.479e-03), r);
	r = MulAdd(s2_6, M4(2.669e-04, 5.510e-03, 1.853e-02, 7.035e-03, -1.453e-01, -4.035e-02, -4.686e-02, 3.419e-02, 7.741e-02, -4.406e-02, -4.407e-02, -3.379e-02, -1.623e-02, 3.556e-02, -4.110e-03, 2.244e-03), r);
	r = MulAdd(s2_7, M4(7.366e-02, 3.837e-02, -9.561e-04, -6.028e-02, 1.728e-04, 3.019e-02, -4.455e-02, 5.838e-02, 2.192e-01, 1.857e-01, -3.371e-02, 1.257e-02, -1.802e-01, -1.366e-02, -5.060e-02, 1.007e-01), r);
	r = MulAdd(s2_8, M4(4.923e-03, 1.485e-02, 1.286e-02, -3.308e-02, -7.387e-03, 1.748e-02, -2.415e-02, 3.404e-02, 1.977e-02, -9.480e-02, -2.739e-02, -2.710e-02, -4.596e-03, 3.783e-03, -2.672e-02, 1.560e-03), r);
	// Negative parts of the l1 taps.
	r = MulAdd(s3_0, M4(1.821e-01, -1.726e-02, 6.138e-02, -2.267e-01, 3.341e-03, 3.308e-02, 2.540e-02, 5.495e-02, -6.068e-02, 1.206e-02, 3.137e-02, 9.503e-02, -8.975e-03, 4.392e-03, 3.276e-03, 8.800e-02), r);
	r = MulAdd(s3_1, M4(-8.920e-02, 2.005e-02, 2.098e-01, 2.013e-01, -6.256e-02, 5.254e-02, -7.322e-02, -6.087e-02, -1.509e-01, -6.047e-02, -1.060e-01, 1.994e-01, -3.721e-02, -6.304e-02, -7.775e-02, 1.860e-02), r);
	r = MulAdd(s3_2, M4(3.821e-02, 2.521e-02, -2.931e-01, -6.716e-02, 6.188e-02, -1.317e-02, 1.047e-01, -1.675e-02, -6.626e-02, 1.678e-02, 3.259e-02, 9.227e-02, 3.036e-02, 1.550e-02, 8.623e-02, 3.820e-02), r);
	r = MulAdd(s3_3, M4(-3.170e-01, 1.141e-01, -2.329e-01, -8.852e-03, -2.495e-01, 3.291e-01, -1.958e-02, -5.070e-02, 6.349e-02, 1.136e-01, 2.229e-02, -8.225e-02, -9.059e-02, 2.346e-01, 2.239e-02, 4.994e-02), r);
	r = MulAdd(s3_4, M4(4.100e-01, -5.625e-01, -8.633e-02, -1.264e-01, -5.683e-01, 3.076e-01, 6.387e-01, 7.517e-02, 1.931e-01, 6.261e-02, 9.888e-02, -1.452e-01, -6.821e-03, 1.317e-01, -1.053e-01, -8.811e-02), r);
	r = MulAdd(s3_5, M4(-9.594e-02, 8.279e-02, 1.886e-01, -4.391e-02, -2.549e-02, -3.722e-02, -5.847e-02, 1.530e-01, 1.440e-02, -1.658e-03, -5.252e-02, -3.569e-03, -8.633e-02, -3.385e-02, -6.311e-02, -2.457e-02), r);
	r = MulAdd(s3_6, M4(1.305e-01, -1.413e-02, 2.234e-02, -3.575e-02, -2.280e-01, -1.338e-01, -7.509e-02, 5.110e-02, 1.254e-02, -9.590e-02, -6.164e-02, -8.308e-03, -2.046e-02, 2.761e-02, -2.675e-02, 3.092e-02), r);
	r = MulAdd(s3_7, M4(2.543e-02, -3.494e-02, -5.345e-02, -2.582e-02, -1.284e-02, -2.036e-01, 1.160e-01, 7.207e-02, 4.682e-02, 1.265e-01, 4.585e-02, -1.319e-02, -6.320e-02, 9.944e-03, 3.594e-02, 5.627e-02), r);
	r = MulAdd(s3_8, M4(-1.190e-02, -1.152e-02, -6.325e-02, -1.505e-02, -2.987e-02, -4.424e-03, -7.566e-03, 1.253e-02, -2.009e-02, -4.585e-02, 2.023e-02, 1.135e-02, 5.201e-02, 1.751e-02, -7.444e-03, 1.437e-02), r);
	return r;
}

// Pass 7 (conv6), second output vector: same structure and same 36 inputs as
// this pass's f0, but with its own bias and weight matrices. Pass7 stores the
// result in t1.
//
// Inputs, as supplied by Pass7:
//   s0_k = max(tap, 0)   and s1_k = -max(-tap, 0) for the l0 taps,
//   s2_k = max(tap, 0)   and s3_k = -max(-tap, 0) for the l1 taps,
//   k = 0..8 walks the 3x3 offsets row by row from (-1,-1) to (1,1).
//
// NOTE(review): MulAdd is pulled in by the `//!USE MulAdd` directive; it
// presumably evaluates mul(x, M) + r - confirm against the framework helper.
// The constants are generated weights; do not reorder the accumulation chain.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Bias term.
	V4 r = { -1.351e-03, -1.537e-03, -2.278e-03, 6.430e-04 };
	// Positive parts of the l0 taps.
	r = MulAdd(s0_0, M4(1.417e-02, 2.377e-02, 7.056e-02, -1.708e-01, -4.606e-04, 1.689e-02, 4.567e-02, -1.430e-02, 2.483e-02, 3.123e-02, 4.566e-02, 2.764e-02, -1.146e-01, -3.580e-02, -1.005e-01, 6.299e-02), r);
	r = MulAdd(s0_1, M4(9.670e-02, -8.422e-02, 1.885e-02, 2.737e-01, -6.124e-02, -3.411e-02, 1.177e-02, -4.768e-02, 8.543e-02, -8.313e-02, 4.539e-02, -1.623e-02, 2.000e-01, -5.159e-02, -1.927e-01, 2.705e-01), r);
	r = MulAdd(s0_2, M4(-6.511e-03, 2.374e-02, -3.419e-02, 5.751e-02, 3.509e-03, -5.322e-03, -2.503e-02, -5.954e-02, 7.219e-02, -3.067e-02, -1.468e-02, 2.907e-03, -1.977e-01, -3.698e-02, -8.116e-02, 1.209e-02), r);
	r = MulAdd(s0_3, M4(-7.616e-03, 1.184e-01, -8.993e-02, 4.361e-02, 1.499e-02, -4.169e-03, -4.891e-02, 3.895e-02, 3.504e-02, -2.033e-02, -3.223e-02, -6.037e-02, -7.856e-02, -5.490e-02, 6.653e-02, 8.377e-02), r);
	r = MulAdd(s0_4, M4(1.822e-01, -2.446e-01, -2.585e-01, 1.245e-01, -5.140e-02, 1.930e-01, 1.942e-02, 1.680e-02, -2.178e-01, 1.537e-01, -1.011e+00, -4.548e-01, 3.506e-01, 2.489e-01, 2.640e-01, 1.546e-01), r);
	r = MulAdd(s0_5, M4(7.701e-02, -3.282e-02, -4.543e-02, -4.894e-02, 3.782e-03, -9.876e-04, 5.202e-02, -2.252e-02, 2.125e-01, 1.396e-01, -7.587e-02, -2.342e-01, -2.900e-01, 5.066e-02, 2.059e-01, -1.003e-01), r);
	r = MulAdd(s0_6, M4(1.741e-02, -5.800e-02, -1.900e-02, -1.103e-01, -1.544e-02, 1.248e-02, -2.188e-02, 2.753e-02, -6.159e-03, 3.643e-02, 2.510e-02, -1.613e-02, -1.972e-02, -5.235e-02, -3.681e-02, 5.366e-02), r);
	r = MulAdd(s0_7, M4(1.740e-02, 1.441e-01, 3.796e-02, 2.267e-02, -2.051e-02, -1.564e-01, -6.488e-02, -6.801e-02, 3.368e-02, -6.264e-02, 1.791e-02, -5.944e-02, -4.683e-02, -1.256e-02, -6.540e-02, 3.546e-02), r);
	r = MulAdd(s0_8, M4(-6.253e-02, -2.709e-02, 3.707e-02, 1.718e-02, -3.430e-02, 3.417e-02, -5.257e-02, -3.959e-04, 2.007e-01, -7.594e-02, -3.454e-02, -2.327e-05, -2.480e-02, 5.655e-02, -4.720e-02, 3.006e-02), r);
	// Negative parts of the l0 taps.
	r = MulAdd(s1_0, M4(3.999e-03, 1.203e-03, -9.676e-03, -2.231e-02, 3.772e-02, 2.693e-02, 2.818e-02, 2.641e-02, -3.407e-02, -1.765e-02, -9.486e-04, -5.857e-03, -8.413e-02, 6.513e-03, -2.523e-02, 2.071e-02), r);
	r = MulAdd(s1_1, M4(6.666e-02, -5.555e-02, -1.008e-01, 2.823e-01, -4.945e-02, 3.166e-03, 1.133e-01, -3.291e-01, 1.761e-01, -1.626e-02, 6.843e-02, 2.767e-02, 2.024e-01, -9.546e-02, -1.752e-01, 6.617e-02), r);
	r = MulAdd(s1_2, M4(-1.210e-01, -6.037e-02, 5.096e-02, 8.161e-02, 1.310e-01, -2.582e-02, -6.073e-02, 5.804e-02, 5.883e-02, -5.329e-02, -1.231e-02, -2.863e-02, -5.061e-02, -2.119e-02, -9.351e-02, -8.943e-02), r);
	r = MulAdd(s1_3, M4(-3.971e-02, -1.009e-01, -6.313e-02, 1.450e-01, 6.074e-02, -6.618e-02, -6.086e-04, 1.100e-01, 7.771e-02, 9.246e-02, -8.459e-02, -7.042e-03, -1.138e-01, 1.398e-01, -5.286e-02, 6.421e-02), r);
	r = MulAdd(s1_4, M4(-3.669e-01, -6.429e-02, 4.852e-02, -5.484e-01, -1.444e-01, 5.232e-01, -2.359e-02, -2.425e-01, 3.525e-01, 1.542e-01, -1.489e-01, 1.304e-01, 2.430e-01, 3.256e-01, 2.770e-01, 4.015e-02), r);
	r = MulAdd(s1_5, M4(-1.860e-01, 9.205e-02, -6.804e-02, 2.132e-01, 3.018e-01, -1.301e-02, 2.932e-01, -4.531e-01, -1.165e-01, 1.096e-01, -5.766e-02, 2.676e-02, -8.117e-02, -4.090e-02, 7.593e-02, -1.031e-01), r);
	r = MulAdd(s1_6, M4(2.984e-02, 5.976e-03, 2.771e-02, -1.520e-02, 9.423e-03, 7.899e-02, -3.400e-02, -1.421e-01, -1.602e-02, -1.316e-02, -1.356e-02, -1.758e-02, -3.187e-02, -8.554e-02, -6.954e-02, 6.126e-02), r);
	r = MulAdd(s1_7, M4(3.484e-03, 5.750e-02, 4.953e-02, 1.427e-02, 5.264e-02, 1.119e-01, -1.563e-01, -1.832e-02, 7.184e-02, 1.238e-01, 1.384e-03, -2.950e-03, -2.796e-02, -1.510e-02, -5.581e-02, 5.907e-03), r);
	r = MulAdd(s1_8, M4(-2.979e-02, -1.805e-01, 4.289e-02, 7.547e-02, 1.453e-01, 1.155e-01, -6.317e-02, 2.787e-03, 6.409e-02, -2.811e-02, 2.758e-02, 1.464e-02, -4.431e-02, 2.005e-03, -3.745e-02, 6.827e-03), r);
	// Positive parts of the l1 taps.
	r = MulAdd(s2_0, M4(2.466e-02, 3.345e-02, 1.015e-02, 1.515e-02, 1.833e-02, 8.333e-03, -2.040e-03, 6.401e-02, 2.044e-02, -6.089e-02, -5.738e-02, 3.871e-02, -1.920e-02, 5.305e-02, 5.707e-02, 2.513e-02), r);
	r = MulAdd(s2_1, M4(7.018e-02, -2.544e-02, -2.494e-02, 1.088e-01, -3.509e-02, -8.734e-03, -1.365e-02, -9.361e-02, -2.252e-03, -1.209e-02, -3.689e-02, 7.149e-02, -3.120e-02, 2.036e-02, 4.506e-02, -3.936e-01), r);
	r = MulAdd(s2_2, M4(-8.268e-03, -1.327e-02, 2.513e-02, -4.868e-03, -2.935e-02, 8.804e-03, -5.610e-03, 2.269e-02, 6.841e-03, -6.307e-04, -6.629e-02, -4.883e-03, 2.400e-02, -7.430e-04, -8.010e-03, 7.980e-02), r);
	r = MulAdd(s2_3, M4(-1.232e-02, -2.083e-02, -1.672e-02, -2.160e-02, 1.786e-02, -2.434e-01, -7.226e-02, 1.562e-01, 9.118e-02, -1.116e-01, -6.319e-02, 2.136e-01, -3.159e-03, -1.786e-01, -6.991e-02, 4.110e-01), r);
	r = MulAdd(s2_4, M4(-2.275e-02, -2.278e-02, -1.567e-01, 9.989e-02, -1.908e-01, 5.677e-02, -1.271e-01, -2.123e-02, -1.454e-01, 2.650e-01, 2.278e-01, -1.885e-01, -9.669e-02, 1.841e-01, 7.851e-02, -6.762e-01), r);
	r = MulAdd(s2_5, M4(-3.343e-02, 3.704e-02, -3.857e-02, 8.299e-03, 2.396e-02, -5.124e-02, -5.434e-02, 7.997e-02, 3.406e-02, -4.136e-02, 1.870e-02, 3.525e-02, -6.752e-02, -8.309e-02, 1.068e-01, 3.379e-03), r);
	r = MulAdd(s2_6, M4(7.981e-03, -4.048e-02, 3.325e-02, -9.389e-03, -1.876e-02, 2.147e-02, -6.131e-02, 9.452e-02, 1.583e-02, 1.418e-01, -1.309e-02, 1.419e-02, -4.582e-03, -4.512e-03, -3.895e-02, 4.452e-02), r);
	r = MulAdd(s2_7, M4(-3.083e-03, 3.093e-03, 2.493e-02, 3.899e-02, 4.875e-02, -1.968e-01, -1.069e-01, 6.543e-02, -1.266e-01, -2.185e-01, -9.342e-02, 9.424e-02, -5.126e-02, -1.718e-02, 9.158e-03, -3.601e-02), r);
	r = MulAdd(s2_8, M4(-3.967e-02, -3.457e-02, 2.545e-02, 2.582e-02, 1.908e-03, 3.888e-02, -4.385e-02, -2.935e-02, 7.741e-02, 3.255e-02, -3.235e-02, 6.256e-02, 1.187e-02, -2.497e-02, -3.396e-02, 1.806e-02), r);
	// Negative parts of the l1 taps.
	r = MulAdd(s3_0, M4(1.117e-02, -7.495e-02, -4.312e-02, 6.636e-05, -3.893e-02, -4.891e-02, -8.095e-02, 8.555e-02, -1.992e-02, 1.728e-03, -3.447e-02, -8.472e-02, -1.013e-02, 2.594e-02, 6.256e-02, 1.169e-01), r);
	r = MulAdd(s3_1, M4(-1.737e-01, 9.502e-02, -3.787e-02, 1.131e-01, 4.237e-02, 1.059e-02, -8.165e-02, 9.365e-03, 1.581e-03, 1.907e-01, 1.439e-01, -2.903e-02, 9.400e-03, -4.874e-02, -1.740e-02, -9.145e-02), r);
	r = MulAdd(s3_2, M4(2.043e-01, 1.205e-03, -3.967e-02, 5.084e-02, -3.772e-02, -1.594e-02, -2.356e-02, 4.866e-02, -3.912e-02, -1.182e-01, 1.228e-01, 2.044e-02, -2.094e-02, -6.302e-03, -8.608e-03, 4.632e-02), r);
	r = MulAdd(s3_3, M4(1.028e-02, -2.428e-01, 4.209e-02, 2.619e-01, 8.150e-02, -2.480e-01, -3.820e-02, 9.968e-02, 4.816e-02, -4.436e-02, -3.762e-03, 6.175e-02, -3.872e-03, -2.305e-01, -1.069e-01, 1.947e-01), r);
	r = MulAdd(s3_4, M4(-7.723e-01, 9.837e-02, 3.011e-02, 1.353e-01, -1.254e-01, 2.501e-01, 1.086e-02, -4.026e-01, -2.775e-02, -2.070e-01, -1.837e-01, -1.658e-01, 1.984e-02, 1.307e-01, 1.635e-01, -1.135e-01), r);
	r = MulAdd(s3_5, M4(1.416e-02, -9.369e-02, 1.069e-01, -8.305e-02, 1.437e-01, -3.897e-02, -6.218e-02, 3.772e-02, 6.288e-02, 1.493e-01, -5.597e-02, 7.343e-02, -8.453e-02, 4.008e-02, -1.136e-02, 1.524e-02), r);
	r = MulAdd(s3_6, M4(6.399e-02, 1.405e-01, -1.857e-03, 3.541e-02, -1.237e-02, 1.110e-01, -4.929e-02, 2.277e-02, 1.195e-02, 1.679e-01, 2.287e-02, 1.645e-02, 8.215e-03, -2.489e-04, -3.430e-02, -2.937e-02), r);
	r = MulAdd(s3_7, M4(-5.599e-02, 3.509e-01, 7.784e-02, 3.488e-02, 5.222e-01, -1.892e-01, 1.668e-02, 1.938e-04, -8.913e-03, -3.322e-02, -6.398e-02, -2.276e-02, 5.969e-04, -1.078e-01, -5.033e-03, 6.744e-02), r);
	r = MulAdd(s3_8, M4(-1.150e-02, -5.770e-02, -1.917e-02, 1.003e-01, 2.864e-02, -6.083e-04, -2.847e-02, -1.435e-02, 4.296e-02, -9.737e-04, -4.802e-02, 6.634e-02, 4.267e-02, -3.589e-02, 1.502e-02, 1.632e-02), r);
	return r;
}

// Pass 7 (conv6) entry point. Each thread handles one pixel: it gathers a 3x3
// neighbourhood from t2 (via l0) and from t3 (via l1), splits every tap into a
// non-negative and a non-positive part, and stores f0 / f1 of the resulting
// 36 vectors into t0 / t1.
void Pass7(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names on purpose: the O() sampling macro
	// behind l0 / l1 expands to `pos + p * pt` and picks both up from this scope.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	// Threads beyond the size reported by GetInputSize() have nothing to write.
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// t2 taps, numbered row by row from (-1,-1) to (1,1).
	// posA* = max(tap, 0), negA* = -max(-tap, 0).
	const V4 tapA0 = l0(-1.0, -1.0);
	const V4 posA0 = max(tapA0, 0.0);
	const V4 negA0 = -max(-tapA0, 0.0);
	const V4 tapA1 = l0(0.0, -1.0);
	const V4 posA1 = max(tapA1, 0.0);
	const V4 negA1 = -max(-tapA1, 0.0);
	const V4 tapA2 = l0(1.0, -1.0);
	const V4 posA2 = max(tapA2, 0.0);
	const V4 negA2 = -max(-tapA2, 0.0);
	const V4 tapA3 = l0(-1.0, 0.0);
	const V4 posA3 = max(tapA3, 0.0);
	const V4 negA3 = -max(-tapA3, 0.0);
	const V4 tapA4 = l0(0.0, 0.0);
	const V4 posA4 = max(tapA4, 0.0);
	const V4 negA4 = -max(-tapA4, 0.0);
	const V4 tapA5 = l0(1.0, 0.0);
	const V4 posA5 = max(tapA5, 0.0);
	const V4 negA5 = -max(-tapA5, 0.0);
	const V4 tapA6 = l0(-1.0, 1.0);
	const V4 posA6 = max(tapA6, 0.0);
	const V4 negA6 = -max(-tapA6, 0.0);
	const V4 tapA7 = l0(0.0, 1.0);
	const V4 posA7 = max(tapA7, 0.0);
	const V4 negA7 = -max(-tapA7, 0.0);
	const V4 tapA8 = l0(1.0, 1.0);
	const V4 posA8 = max(tapA8, 0.0);
	const V4 negA8 = -max(-tapA8, 0.0);

	// t3 taps, same order and the same positive / negative split.
	const V4 tapB0 = l1(-1.0, -1.0);
	const V4 posB0 = max(tapB0, 0.0);
	const V4 negB0 = -max(-tapB0, 0.0);
	const V4 tapB1 = l1(0.0, -1.0);
	const V4 posB1 = max(tapB1, 0.0);
	const V4 negB1 = -max(-tapB1, 0.0);
	const V4 tapB2 = l1(1.0, -1.0);
	const V4 posB2 = max(tapB2, 0.0);
	const V4 negB2 = -max(-tapB2, 0.0);
	const V4 tapB3 = l1(-1.0, 0.0);
	const V4 posB3 = max(tapB3, 0.0);
	const V4 negB3 = -max(-tapB3, 0.0);
	const V4 tapB4 = l1(0.0, 0.0);
	const V4 posB4 = max(tapB4, 0.0);
	const V4 negB4 = -max(-tapB4, 0.0);
	const V4 tapB5 = l1(1.0, 0.0);
	const V4 posB5 = max(tapB5, 0.0);
	const V4 negB5 = -max(-tapB5, 0.0);
	const V4 tapB6 = l1(-1.0, 1.0);
	const V4 posB6 = max(tapB6, 0.0);
	const V4 negB6 = -max(-tapB6, 0.0);
	const V4 tapB7 = l1(0.0, 1.0);
	const V4 posB7 = max(tapB7, 0.0);
	const V4 negB7 = -max(-tapB7, 0.0);
	const V4 tapB8 = l1(1.0, 1.0);
	const V4 posB8 = max(tapB8, 0.0);
	const V4 negB8 = -max(-tapB8, 0.0);

	// Argument order expected by f0 / f1: A positive, A negative, B positive, B negative.
	t0[gxy] = f0(
		posA0, posA1, posA2, posA3, posA4, posA5, posA6, posA7, posA8,
		negA0, negA1, negA2, negA3, negA4, negA5, negA6, negA7, negA8,
		posB0, posB1, posB2, posB3, posB4, posB5, posB6, posB7, posB8,
		negB0, negB1, negB2, negB3, negB4, negB5, negB6, negB7, negB8);
	t1[gxy] = f1(
		posA0, posA1, posA2, posA3, posA4, posA5, posA6, posA7, posA8,
		negA0, negA1, negA2, negA3, negA4, negA5, negA6, negA7, negA8,
		posB0, posB1, posB2, posB3, posB4, posB5, posB6, posB7, posB8,
		negB0, negB1, negB2, negB3, negB4, negB5, negB6, negB7, negB8);
}

//!PASS 8
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0, t1
//!OUT OUTPUT

// Pass-local tap fetchers: read t0 (l0) or t1 (l1) at `pos` displaced by
// (x, y) steps of `pt`, using the point sampler SP via the O() macro.
// Both expand to code that refers to locals named `pos` and `pt`, so those
// must exist under exactly these names wherever l0 / l1 are used.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))

// Pass 8 (out-shuffle) output layer: starts from a constant 4-component bias,
// accumulates one MulAdd per input vector (each with its own constant 4x4
// weight matrix) and returns tanh of the sum, so every component lies in
// (-1, 1). Pass8 adds the four components to the luma of the four output
// pixels of a 2x2 quad.
//
// Inputs, as supplied by Pass8:
//   s0_k = max(tap, 0)   and s1_k = -max(-tap, 0) for the l0 taps,
//   s2_k = max(tap, 0)   and s3_k = -max(-tap, 0) for the l1 taps,
//   k = 0..8 walks the 3x3 offsets row by row from (-1,-1) to (1,1).
//
// NOTE(review): MulAdd is pulled in by the `//!USE MulAdd` directive; it
// presumably evaluates mul(x, M) + r - confirm against the framework helper.
// The constants are generated weights; do not reorder the accumulation chain.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Bias term.
	V4 r = { 1.205e-04, 8.082e-04, 5.443e-04, -3.668e-04 };
	// Positive parts of the l0 taps.
	r = MulAdd(s0_0, M4(1.665e-02, -8.034e-03, -1.825e-02, 3.618e-03, -1.946e-03, -9.426e-03, -8.760e-03, 1.811e-02, 5.287e-02, -2.521e-02, 1.465e-02, -2.874e-02, -1.678e-04, -5.970e-03, -2.613e-03, -4.475e-03), r);
	r = MulAdd(s0_1, M4(1.441e-01, 5.668e-02, 9.456e-03, -6.067e-02, 2.085e-01, 9.162e-02, 3.905e-02, -2.220e-02, -9.590e-02, -4.507e-02, -2.980e-02, 3.405e-02, 5.457e-02, 1.862e-02, 2.969e-02, -3.874e-03), r);
	r = MulAdd(s0_2, M4(-3.635e-02, 4.605e-02, -2.088e-02, -1.038e-03, -5.091e-02, 3.757e-02, -2.177e-02, -3.267e-03, 3.186e-02, 3.149e-03, 7.400e-03, 3.707e-03, -2.361e-02, -4.816e-03, -1.245e-02, 1.252e-02), r);
	r = MulAdd(s0_3, M4(-2.961e-02, -3.446e-03, 9.278e-03, -3.365e-02, 1.645e-03, -1.782e-02, -5.623e-03, -4.475e-02, 2.640e-02, -1.904e-02, -1.195e-01, 1.154e-01, -2.134e-02, 6.999e-03, -1.240e-02, -2.264e-02), r);
	r = MulAdd(s0_4, M4(-4.989e-01, -3.369e-01, 5.449e-01, 3.551e-01, 2.530e-01, 1.923e-01, 4.218e-01, 2.627e-01, -4.863e-02, 1.521e-01, 6.711e-02, -2.250e-01, -8.437e-02, -2.255e-01, -1.239e-02, 7.682e-02), r);
	r = MulAdd(s0_5, M4(-3.139e-02, -1.480e-01, -2.284e-02, 8.665e-02, -4.610e-02, 4.704e-02, -3.240e-02, 8.506e-02, 1.377e-03, 2.643e-02, 2.989e-03, 8.917e-03, -2.035e-02, 3.411e-02, 1.877e-02, -4.490e-02), r);
	r = MulAdd(s0_6, M4(9.737e-03, 1.686e-02, -3.094e-02, 2.318e-02, -6.414e-03, -2.440e-03, 2.925e-03, -1.545e-03, 4.439e-02, 9.093e-04, 3.674e-01, -3.203e-01, 5.221e-02, 2.167e-02, 4.061e-02, 4.237e-02), r);
	r = MulAdd(s0_7, M4(2.009e-02, -4.379e-03, 6.470e-02, -6.519e-02, -2.176e-02, -8.537e-03, 2.645e-02, 1.331e-02, 3.747e-02, 4.555e-02, 3.777e-02, -1.084e-02, 1.072e-01, 1.385e-01, 1.934e-02, -9.520e-02), r);
	r = MulAdd(s0_8, M4(1.143e-02, 1.461e-02, -4.811e-02, 1.175e-02, -1.676e-02, -2.844e-02, -4.268e-02, -2.958e-02, -5.801e-03, 1.108e-03, -1.108e-02, 4.385e-02, -9.585e-03, -1.232e-02, -3.864e-02, 1.812e-02), r);
	// Negative parts of the l0 taps.
	r = MulAdd(s1_0, M4(6.319e-04, -1.321e-02, -4.810e-03, 3.134e-03, 1.949e-02, 8.926e-04, 3.095e-04, 1.404e-02, 2.945e-02, -1.625e-02, -1.022e-03, -8.526e-03, 1.548e-02, -1.219e-02, -9.063e-03, -8.040e-03), r);
	r = MulAdd(s1_1, M4(1.512e-01, 5.866e-02, 2.923e-02, -2.301e-02, 1.588e-01, 1.093e-01, 4.457e-02, -2.858e-02, -9.663e-02, -4.469e-02, -3.315e-02, 1.949e-02, 6.519e-02, 4.569e-02, 2.538e-02, -2.218e-02), r);
	r = MulAdd(s1_2, M4(-4.665e-02, 2.594e-02, -1.993e-02, 1.175e-02, -3.333e-02, 2.913e-02, -9.827e-03, 3.326e-05, 3.552e-02, -3.633e-03, 8.832e-03, 9.121e-04, -2.972e-02, -8.319e-03, -1.437e-02, 1.409e-02), r);
	r = MulAdd(s1_3, M4(-1.715e-02, 2.335e-02, -1.714e-02, -3.837e-02, -8.227e-02, 8.352e-04, 6.854e-04, -3.553e-02, 3.693e-02, -8.114e-02, -1.259e-02, 1.459e-02, -1.087e-01, 3.946e-04, 2.406e-02, -1.555e-02), r);
	r = MulAdd(s1_4, M4(-4.320e-02, -1.420e-01, 1.501e-01, 1.979e-01, 4.555e-01, 5.655e-02, 4.005e-01, 4.561e-01, -6.498e-02, 1.753e-01, 7.693e-02, -2.573e-01, -1.655e-01, -3.975e-01, -6.450e-03, 1.195e-01), r);
	r = MulAdd(s1_5, M4(-3.430e-02, -4.627e-02, -3.033e-02, 9.612e-03, -4.261e-02, 9.353e-02, -5.574e-02, 7.956e-02, -1.190e-03, 1.946e-02, 9.613e-03, 2.960e-02, -1.353e-02, 6.958e-02, 1.888e-02, -3.233e-02), r);
	r = MulAdd(s1_6, M4(-3.424e-03, 8.490e-03, 4.527e-03, 2.695e-02, 9.218e-03, -3.838e-04, 5.782e-03, -1.419e-02, 6.274e-02, -1.145e-02, 4.382e-02, -1.952e-02, -1.567e-03, 1.579e-02, 1.813e-01, 3.017e-02), r);
	r = MulAdd(s1_7, M4(3.557e-02, 1.086e-02, -6.621e-02, -9.656e-02, 4.175e-02, 5.237e-02, -1.381e-01, -1.110e-01, 3.749e-02, 5.361e-02, 3.329e-02, 8.597e-02, -9.733e-03, -8.791e-02, 1.724e-01, 3.238e-01), r);
	r = MulAdd(s1_8, M4(1.282e-02, 7.556e-03, -2.733e-02, -1.689e-02, -3.745e-02, -4.170e-02, -3.300e-02, -4.508e-03, 3.246e-03, 5.182e-03, -1.597e-02, 3.762e-02, 3.515e-03, 8.707e-04, -2.837e-02, 2.238e-02), r);
	// Positive parts of the l1 taps.
	r = MulAdd(s2_0, M4(2.606e-02, -4.157e-03, 1.889e-02, -1.571e-02, -1.264e-03, 1.164e-02, 3.781e-02, 2.570e-02, -1.546e-02, 5.382e-03, 3.603e-02, 2.516e-02, 6.918e-03, 1.161e-02, 3.155e-03, -1.756e-03), r);
	r = MulAdd(s2_1, M4(-4.998e-02, -6.958e-02, -5.387e-02, 2.511e-02, -1.032e-02, -1.567e-01, 1.284e-01, 4.528e-02, -5.753e-02, 1.843e-02, 6.067e-02, 1.268e-02, -8.389e-02, -3.496e-02, -1.261e-02, 1.271e-02), r);
	r = MulAdd(s2_2, M4(2.201e-02, 1.038e-02, 1.562e-03, 6.786e-03, 3.414e-02, 6.890e-02, -2.765e-02, 2.985e-02, 1.279e-02, -3.082e-02, 1.793e-03, 9.163e-03, 2.754e-02, -2.456e-02, 7.180e-03, 1.141e-03), r);
	r = MulAdd(s2_3, M4(6.714e-02, 1.266e-02, 1.392e-03, 7.777e-02, 1.648e-02, -3.943e-02, 7.426e-02, -1.875e-02, 3.643e-01, 7.651e-02, -3.760e-01, -3.663e-02, -2.532e-02, -5.826e-03, 1.486e-02, 2.171e-02), r);
	r = MulAdd(s2_4, M4(3.235e-01, -3.337e-01, 2.851e-01, -4.555e-01, 2.373e-01, 3.584e-01, -4.600e-01, 1.548e-01, 5.534e-01, 6.629e-01, 9.779e-03, -4.056e-01, 8.883e-02, -1.558e-01, -1.062e-01, -1.018e-01), r);
	r = MulAdd(s2_5, M4(-4.431e-02, 1.741e-02, -2.464e-02, 4.318e-02, -1.063e-02, -9.543e-02, 9.203e-02, 3.816e-03, -8.191e-02, 3.298e-02, -3.944e-02, -1.789e-02, -3.580e-02, 9.644e-02, 2.248e-02, -5.676e-03), r);
	r = MulAdd(s2_6, M4(-1.707e-02, -1.871e-03, 6.918e-03, -9.750e-03, 1.813e-02, 1.149e-02, 1.491e-02, -2.354e-03, -6.421e-02, -2.200e-03, 6.571e-02, 3.823e-02, 1.606e-02, 1.188e-02, -2.462e-04, -6.835e-03), r);
	r = MulAdd(s2_7, M4(-1.709e-02, -3.317e-02, 7.031e-02, -7.599e-02, -3.559e-03, 5.356e-03, 1.223e-01, 5.016e-03, -3.943e-02, -5.634e-02, 6.522e-02, 1.959e-02, 2.838e-03, 3.219e-02, 8.960e-02, -2.077e-02), r);
	r = MulAdd(s2_8, M4(-1.128e-02, -1.450e-02, -2.515e-02, -1.278e-02, 1.182e-02, -6.085e-03, -3.478e-02, -7.803e-05, 2.175e-03, -1.893e-02, -3.165e-02, 1.144e-02, -4.075e-03, -1.776e-02, -2.098e-02, 2.191e-02), r);
	// Negative parts of the l1 taps.
	r = MulAdd(s3_0, M4(3.673e-02, -7.948e-03, 1.825e-02, -2.366e-02, -4.290e-03, 1.377e-02, 8.323e-03, 1.511e-02, -5.298e-03, 2.701e-03, 8.641e-03, 4.330e-03, 1.059e-03, 1.412e-02, -7.184e-03, 3.384e-04), r);
	r = MulAdd(s3_1, M4(-1.180e-01, -1.494e-02, -2.777e-02, 3.327e-02, 2.241e-02, -4.968e-02, -2.362e-02, 6.181e-02, 3.152e-02, 5.492e-03, 1.043e-02, -2.723e-02, -6.920e-02, -5.721e-02, -2.918e-03, 9.618e-03), r);
	r = MulAdd(s3_2, M4(4.164e-02, 4.799e-03, 7.672e-03, 6.442e-03, 2.289e-02, -5.849e-03, -1.497e-02, 3.935e-02, -9.540e-03, -5.052e-03, -6.008e-03, 3.086e-04, 1.775e-02, 5.478e-03, 6.428e-03, 3.996e-03), r);
	r = MulAdd(s3_3, M4(-4.285e-02, 2.518e-02, -6.202e-02, 7.750e-02, 3.723e-02, -1.612e-02, 1.900e-03, 1.359e-02, 3.029e-03, 2.731e-03, -5.163e-03, -2.669e-02, -3.451e-03, 1.188e-02, 2.654e-02, 4.026e-02), r);
	r = MulAdd(s3_4, M4(1.114e-01, 2.722e-02, 8.689e-02, -2.510e-01, 3.922e-02, 1.319e-01, 1.630e-01, -7.910e-02, 2.532e-01, 1.401e-01, 1.096e-01, 1.772e-01, 3.884e-01, -4.444e-01, 5.710e-02, -2.594e-01), r);
	r = MulAdd(s3_5, M4(-3.528e-02, 1.519e-03, -1.285e-02, 2.613e-02, -5.286e-02, 3.211e-02, 1.899e-02, -8.061e-03, -6.238e-02, 7.851e-03, -1.807e-02, -3.191e-02, -3.676e-02, 5.701e-02, 6.615e-03, 1.207e-02), r);
	r = MulAdd(s3_6, M4(-8.811e-03, -2.217e-03, -8.144e-03, -2.373e-02, 1.499e-02, 6.994e-03, 2.934e-02, -5.142e-03, -6.236e-03, 7.265e-04, -2.347e-03, 1.380e-02, 1.357e-02, 1.125e-02, 7.815e-03, 1.489e-03), r);
	r = MulAdd(s3_7, M4(-2.411e-02, -3.528e-02, -1.130e-02, 2.333e-02, 1.379e-02, 2.206e-02, 9.893e-03, 3.702e-02, -2.773e-02, -7.861e-04, 9.765e-02, 7.302e-03, 3.641e-02, 2.749e-02, 2.540e-01, -1.831e-01), r);
	r = MulAdd(s3_8, M4(-2.727e-03, -1.083e-02, -1.462e-02, -1.577e-02, -2.030e-03, -6.144e-03, -2.325e-02, 1.531e-02, 2.263e-04, -1.137e-02, -2.704e-02, -1.031e-02, -1.318e-02, -8.554e-03, -3.022e-02, 1.633e-02), r);
	// Squash the four outputs into (-1, 1).
	return tanh(r);
}

// Pass 8 (out-shuffle) entry point. Each thread produces one 2x2 quad of
// OUTPUT pixels: it evaluates f0 on the 3x3 neighbourhoods of t0 / t1 around
// the matching low-resolution position, then for each of the four output
// pixels takes a linearly filtered INPUT sample, converts it with rgb2yuv,
// adds one component of the f0 result to the first (luma) channel, clamps that
// channel to [0, 1] and converts back with yuv2rgb. The other two channels are
// passed through unchanged.
void Pass8(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names on purpose: the O() sampling macro
	// behind l0 / l1 expands to `pos + p * pt` and picks both up from this scope.
	float2 pt = float2(GetInputPt());
	// Top-left output pixel of this thread's quad.
	uint2 gxy = blockStart + (Rmp8x8(tid.x) << 1);
	uint2 size = GetOutputSize();
	if (any(gxy >= size)) {
		return;
	}
	// Halving the output coordinate gives the index used for the t0 / t1 taps.
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// t0 taps, numbered row by row from (-1,-1) to (1,1).
	// posA* = max(tap, 0), negA* = -max(-tap, 0).
	const V4 tapA0 = l0(-1.0, -1.0);
	const V4 posA0 = max(tapA0, 0.0);
	const V4 negA0 = -max(-tapA0, 0.0);
	const V4 tapA1 = l0(0.0, -1.0);
	const V4 posA1 = max(tapA1, 0.0);
	const V4 negA1 = -max(-tapA1, 0.0);
	const V4 tapA2 = l0(1.0, -1.0);
	const V4 posA2 = max(tapA2, 0.0);
	const V4 negA2 = -max(-tapA2, 0.0);
	const V4 tapA3 = l0(-1.0, 0.0);
	const V4 posA3 = max(tapA3, 0.0);
	const V4 negA3 = -max(-tapA3, 0.0);
	const V4 tapA4 = l0(0.0, 0.0);
	const V4 posA4 = max(tapA4, 0.0);
	const V4 negA4 = -max(-tapA4, 0.0);
	const V4 tapA5 = l0(1.0, 0.0);
	const V4 posA5 = max(tapA5, 0.0);
	const V4 negA5 = -max(-tapA5, 0.0);
	const V4 tapA6 = l0(-1.0, 1.0);
	const V4 posA6 = max(tapA6, 0.0);
	const V4 negA6 = -max(-tapA6, 0.0);
	const V4 tapA7 = l0(0.0, 1.0);
	const V4 posA7 = max(tapA7, 0.0);
	const V4 negA7 = -max(-tapA7, 0.0);
	const V4 tapA8 = l0(1.0, 1.0);
	const V4 posA8 = max(tapA8, 0.0);
	const V4 negA8 = -max(-tapA8, 0.0);

	// t1 taps, same order and the same positive / negative split.
	const V4 tapB0 = l1(-1.0, -1.0);
	const V4 posB0 = max(tapB0, 0.0);
	const V4 negB0 = -max(-tapB0, 0.0);
	const V4 tapB1 = l1(0.0, -1.0);
	const V4 posB1 = max(tapB1, 0.0);
	const V4 negB1 = -max(-tapB1, 0.0);
	const V4 tapB2 = l1(1.0, -1.0);
	const V4 posB2 = max(tapB2, 0.0);
	const V4 negB2 = -max(-tapB2, 0.0);
	const V4 tapB3 = l1(-1.0, 0.0);
	const V4 posB3 = max(tapB3, 0.0);
	const V4 negB3 = -max(-tapB3, 0.0);
	const V4 tapB4 = l1(0.0, 0.0);
	const V4 posB4 = max(tapB4, 0.0);
	const V4 negB4 = -max(-tapB4, 0.0);
	const V4 tapB5 = l1(1.0, 0.0);
	const V4 posB5 = max(tapB5, 0.0);
	const V4 negB5 = -max(-tapB5, 0.0);
	const V4 tapB6 = l1(-1.0, 1.0);
	const V4 posB6 = max(tapB6, 0.0);
	const V4 negB6 = -max(-tapB6, 0.0);
	const V4 tapB7 = l1(0.0, 1.0);
	const V4 posB7 = max(tapB7, 0.0);
	const V4 negB7 = -max(-tapB7, 0.0);
	const V4 tapB8 = l1(1.0, 1.0);
	const V4 posB8 = max(tapB8, 0.0);
	const V4 negB8 = -max(-tapB8, 0.0);

	// One luma offset per output pixel of the quad (x, y, w, z walked clockwise below).
	const V4 delta = f0(
		posA0, posA1, posA2, posA3, posA4, posA5, posA6, posA7, posA8,
		negA0, negA1, negA2, negA3, negA4, negA5, negA6, negA7, negA8,
		posB0, posB1, posB2, posB3, posB4, posB5, posB6, posB7, posB8,
		negB0, negB1, negB2, negB3, negB4, negB5, negB6, negB7, negB8);

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	float2 outPt = float2(GetOutputPt());

	// Top-left pixel: step back by half an output texel from the tap position.
	pos -= 0.5f * outPt;
	MF3 ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(ycc.r + delta.x), ycc.yz)), 1);

	// Top-right pixel.
	gxy.x += 1;
	pos.x += outPt.x;
	ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(ycc.r + delta.y), ycc.yz)), 1);

	// Bottom-right pixel.
	gxy.y += 1;
	pos.y += outPt.y;
	ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(ycc.r + delta.w), ycc.yz)), 1);

	// Bottom-left pixel.
	gxy.x -= 1;
	pos.x -= outPt.x;
	ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(ycc.r + delta.z), ycc.yz)), 1);
}
