e4s-game/gamedata/shaders/d3d11/taa_render.ps.hlsl

/*
		Simple TAA

		References:
		- https://gdcvault.com/play/1022970/Temporal-Reprojection-Anti-Aliasing-in
		- https://research.nvidia.com/labs/rtr/publication/yang2020survey/
		- https://github.com/iryoku/smaa
		- https://michaldrobot.com/2014/08/13/hraa-siggraph-2014-slides-available/
		- https://gpuopen.com/learn/optimized-reversible-tonemapper-for-resolve/
		- https://research.activision.com/publications/2020-03/dynamic-temporal-antialiasing-and-upsampling-in-call-of-duty

        Author:
        - LVutner

        ---IX-Ray Engine---
*/

#include "common.hlsli"

//Interpolated input of the TAA resolve pixel shader
struct PSInput
{
    float4 hpos : SV_POSITION; //Pixel position; main() uses .xy as an integer texel index
    float4 texcoord : TEXCOORD0; //Screen UV in .xy; .zw is not read by this shader
};

Texture2D s_image_prev; //Previous rt_generic_0, sampled as the TAA history at the reprojected UV
float4 scaled_screen_res; //Render resolution: .xy is used as size in pixels, .zw as texel size (presumably 1 / .xy - confirm on the CPU side)

//Settings...
#define TAA_ALT_PATH //Variance-based (mean +- stddev) min-max estimation instead of the soft 3x3 window. Old path may be slower [todo: check]
#define TAA_BLEND_WEIGHT 0.925 //Maximum history weight, used when current and previous velocities agree
#define TAA_HISTORY_SHARPNESS 0.75 //Sharpness factor for history filtering
#define TAA_DEVIATION 1.75 //Half-width of the colour clamp window, in standard deviations (TAA_ALT_PATH only)

//Reversible tonemapper, from Timothy Lottes.
//Maps each channel with c / (1 + c) and clamps the result to [0, 1].
float3 Lottes_Tonemap(float3 c)
{
	float3 inv_denominator = rcp(c + 1.0f);
	return saturate(c * inv_denominator);
}

//Inverse of Lottes_Tonemap: maps each channel with c / (1 - c).
//The input is clamped to [0, 1] first and the denominator uses 1.00001 so that
//an input of exactly 1.0 does not divide by zero.
float3 Lottes_Tonemap_Inverse(float3 c)
{
	float3 clamped = saturate(c);
	float3 inv_denominator = rcp(1.00001f - clamped);
	return clamped * inv_denominator;
}

//Texel offsets of the 3x3 neighbourhood, row by row starting with y = -1.
//Index 4 is the centre texel.
static const int2 offset_3x3[9] =
{
	int2(-1, -1), int2(0, -1), int2(1, -1),
	int2(-1, 0), int2(0, 0), int2(1, 0),
	int2(-1, 1), int2(0, 1), int2(1, 1),
};

//Sharpening history filter, from the CoD presentation.
//current_* are the cross-shaped neighbours and centre of the current frame,
//previous_center is the history sample, f is the sub-texel fraction of the
//reprojected position. Returns the filtered history colour.
float3 SMAABicubicFilter(
	float3 current_top,
	float3 current_bottom,
	float3 current_left,
	float3 current_right,
	float3 current_center,
	float3 previous_center,
	float2 f)
{
	//Hardcoded sharpness, refer to slides. (f * f - f) is <= 0 for f in [0, 1],
	//so the neighbour weights are negative or zero.
	float2 weights = 0.8 * TAA_HISTORY_SHARPNESS * (f * f - f);

	//Weighted neighbours; .w carries the sum of the weights
	float4 horizontal = float4(lerp(current_left, current_right, f.x), 1.0) * weights.x;
	float4 vertical = float4(lerp(current_top, current_bottom, f.y), 1.0) * weights.y;
	float4 accum = horizontal + vertical;

	//Add the history sample and the current centre, then bump the total weight by one
	float neighbor_weight = accum.w;
	accum.xyz += (1.0 + neighbor_weight) * previous_center - neighbor_weight * current_center;
	accum.w += 1.0;

	//Normalize by the total weight
	return accum.xyz / accum.w;
}

//Cheapest way to get 3x3 neighborhood of single channel texture:
//two 2x2 gathers cover seven of the nine texels, two point samples fetch the
//remaining corners.
//
//texcoord        - UV of the centre pixel, used for the two point samples
//gather_texcoord - UV used for the gathers; main() passes texcoord + half a texel
//                  so that the unshifted gather footprint starts at the centre pixel
//d_3x3           - receives the nine s_position .x values in offset_3x3 order
//                  (index 4 = centre). Write-only, hence 'out' rather than 'inout':
//                  the caller's array is uninitialized and must not be copied in.
void get_3x3_depth(float2 texcoord, float2 gather_texcoord, out float d_3x3[9])
{
	//Gather component layout relative to the footprint origin:
	//w = (0, 0), z = (1, 0), x = (0, 1), y = (1, 1)
	float4 d_gather0 = s_position.Gather(smp_nofilter, gather_texcoord); //footprint (0, 0)..(1, 1)
	float4 d_gather1 = s_position.Gather(smp_nofilter, gather_texcoord, int2(-1, -1)); //footprint (-1, -1)..(0, 0)

	d_3x3[0] = d_gather1.w;
	d_3x3[1] = d_gather1.z;
	d_3x3[2] = s_position.SampleLevel(smp_nofilter, texcoord, 0, int2(1, -1)).x; //not covered by either gather
	d_3x3[3] = d_gather1.x;
	d_3x3[4] = d_gather0.w; //d_gather1.y overlap
	d_3x3[5] = d_gather0.z;
	d_3x3[6] = s_position.SampleLevel(smp_nofilter, texcoord, 0, int2(-1, 1)).x; //not covered by either gather
	d_3x3[7] = d_gather0.x;
	d_3x3[8] = d_gather0.y;
}

//SM_5 path, we save 1 sample: three per-channel gathers replace the four
//point samples of the lower-right 2x2 quad, the other five texels are point sampled.
//
//texcoord        - UV of the centre pixel, used for the point samples
//gather_texcoord - UV used for the gathers; main() passes texcoord + half a texel
//                  so that the gather footprint starts at the centre pixel
//c_3x3           - receives the nine s_image colours in offset_3x3 order
//                  (index 4 = centre). Write-only, hence 'out' rather than 'inout':
//                  the caller's array is uninitialized and must not be copied in.
#ifdef SM_5
void get_3x3_color(float2 texcoord, float2 gather_texcoord, out float3 c_3x3[9])
{
	//Gather component layout relative to the footprint origin:
	//w = (0, 0), z = (1, 0), x = (0, 1), y = (1, 1)
	float4 c_gather0_r = s_image.GatherRed(smp_nofilter, gather_texcoord);
	float4 c_gather0_g = s_image.GatherGreen(smp_nofilter, gather_texcoord);
	float4 c_gather0_b = s_image.GatherBlue(smp_nofilter, gather_texcoord);

	c_3x3[0] = s_image.SampleLevel(smp_nofilter, texcoord, 0, int2(-1, -1)).xyz;
	c_3x3[1] = s_image.SampleLevel(smp_nofilter, texcoord, 0, int2(0, -1)).xyz;
	c_3x3[2] = s_image.SampleLevel(smp_nofilter, texcoord, 0, int2(1, -1)).xyz;
	c_3x3[3] = s_image.SampleLevel(smp_nofilter, texcoord, 0, int2(-1, 0)).xyz;
	c_3x3[4] = float3(c_gather0_r.w, c_gather0_g.w, c_gather0_b.w); //centre
	c_3x3[5] = float3(c_gather0_r.z, c_gather0_g.z, c_gather0_b.z);
	c_3x3[6] = s_image.SampleLevel(smp_nofilter, texcoord, 0, int2(-1, 1)).xyz;
	c_3x3[7] = float3(c_gather0_r.x, c_gather0_g.x, c_gather0_b.x);
	c_3x3[8] = float3(c_gather0_r.y, c_gather0_g.y, c_gather0_b.y);
}
#endif

//TAA resolve. Blends the current frame with the reprojected, filtered and
//neighbourhood-clamped history. All colour math runs in tonemapped space and
//is converted back with Lottes_Tonemap_Inverse before being written.
//
//Returns the resolved colour in .xyz. .w is 0.0 when the reprojected position
//falls outside the screen (history rejected, current colour returned) and 1.0 otherwise.
float4 main(PSInput I) : SV_Target
{
	//https://wojtsterna.blogspot.com/2018/02/directx-11-hlsl-gatherred.html
	//Half-texel shift so the 2x2 gather footprint starts at the current pixel
	float2 gather_texcoord = I.texcoord.xy + scaled_screen_res.zw * 0.5;

	//Last valid texel coordinate, used to clamp every integer texture load below
	float2 max_texel = scaled_screen_res.xy - 1;

	//Fetch 3x3 depth neighborhood
	float d_3x3[9];
	get_3x3_depth(I.texcoord.xy, gather_texcoord, d_3x3);

	//Fetch 3x3 color neighborhood. Without SM_5 the array is filled inside the loop below
	float3 c_3x3[9];
	#ifdef SM_5
		get_3x3_color(I.texcoord.xy, gather_texcoord, c_3x3);
	#endif

	int2 depth_offset = int2(0, 0);
	float depth_closest = 1.0;

	#ifdef TAA_ALT_PATH
		//First and second colour moments of the neighbourhood
		float3 c_m = (0.0).xxx;
		float3 c_m2 = (0.0).xxx;
	#endif

	[unroll]
	for (int i = 0; i < 9; i++)
	{
		#ifdef SM_5
			c_3x3[i] = Lottes_Tonemap(c_3x3[i]);
		#else
			int2 offset_hpos = clamp(I.hpos.xy + offset_3x3[i], 0, max_texel);
			c_3x3[i] = Lottes_Tonemap(s_image[offset_hpos].xyz);
		#endif

		//Accumulate moments
		#ifdef TAA_ALT_PATH
			c_m += c_3x3[i] * (1.0 / 9.0);
			c_m2 += c_3x3[i] * c_3x3[i] * (1.0 / 9.0);
		#endif

		float sampled_depth = d_3x3[i];

		//Find closest depth. Sign and initial value should be changed for reverse-z
		if(sampled_depth < depth_closest)
		{
			depth_closest = sampled_depth;
			depth_offset = offset_3x3[i];
		}
	}

	//Get min and max color of 3x3 neighborhood
	#ifdef TAA_ALT_PATH
		//Variance clipping window: mean +- TAA_DEVIATION standard deviations.
		//max(..., 0.0) guards the sqrt against a slightly negative variance from rounding
		float3 c_stddev = sqrt(max(c_m2 - c_m * c_m, 0.0));
		float3 c_min = c_m - c_stddev * TAA_DEVIATION;
		float3 c_max = c_m + c_stddev * TAA_DEVIATION;
	#else
		//Soft window: average of the full 3x3 extremes and the cross-shaped (plus) extremes
		float3 c_min = min(c_3x3[0], min(c_3x3[1], min(c_3x3[2], min(c_3x3[3], min(c_3x3[4], min(c_3x3[5], min(c_3x3[6], min(c_3x3[7], c_3x3[8]))))))));
		c_min += min(c_3x3[1], min(c_3x3[3], min(c_3x3[4], min(c_3x3[5], c_3x3[7]))));
		c_min *= 0.5;

		float3 c_max = max(c_3x3[0], max(c_3x3[1], max(c_3x3[2], max(c_3x3[3], max(c_3x3[4], max(c_3x3[5], max(c_3x3[6], max(c_3x3[7], c_3x3[8]))))))));
		c_max += max(c_3x3[1], max(c_3x3[3], max(c_3x3[4], max(c_3x3[5], c_3x3[7]))));
		c_max *= 0.5;
	#endif

	//Fetch motion vectors at the closest-depth neighbour and reproject.
	//The float2(0.5, -0.5) scale presumably converts clip-space velocity to UV space - confirm against the velocity pass
	float2 motion_vector = s_velocity[clamp(I.hpos.xy + depth_offset, 0, max_texel)].xy * float2(0.5, -0.5);
	float2 reprojected_tc = I.texcoord.xy - motion_vector;

	//Early quit: the reprojected position is off screen, so there is no history to use
	if(any(reprojected_tc != saturate(reprojected_tc)))
		return float4(Lottes_Tonemap_Inverse(c_3x3[4]), 0.0);

	//Fetch previous frame
	float3 p_4 = Lottes_Tonemap(s_image_prev.SampleLevel(smp_rtlinear, reprojected_tc, 0).xyz);

	//Spatio-temporal bicubic filter
	p_4 = SMAABicubicFilter(c_3x3[1], c_3x3[7], c_3x3[3], c_3x3[5], c_3x3[4], p_4, frac(reprojected_tc * scaled_screen_res.xy - 0.5));

	//Clamp history
	p_4 = clamp(p_4, c_min, c_max);

	//SMAA-ish velocity weighting. Something better should be used...
	//The index is clamped like the load above: a reprojected coordinate of exactly 1.0
	//passes the early quit test but would otherwise address one texel past the edge.
	float2 p_motion_vector = s_velocity[clamp(reprojected_tc * scaled_screen_res.xy, 0, max_texel)].xy * float2(0.5, -0.5);

	//Difference of velocity magnitudes. The SMAA reference takes sqrt(5 * length), squares it
	//and divides by 5 again, which reduces to this expression.
	float delta = abs(length(motion_vector) - length(p_motion_vector));
	float weight = TAA_BLEND_WEIGHT * saturate(1.0 - sqrt(delta) * 8.0);

	//Simple lerp is ok, RGBA16F lmao
	float3 reprojected_color = lerp(c_3x3[4], p_4, weight);

	//Back to linear HDR
	reprojected_color = max(reprojected_color, 0.0);
	reprojected_color = Lottes_Tonemap_Inverse(reprojected_color);

	return float4(reprojected_color, 1.0);
}