e4s-sdk/gamedata/shaders/r3/shadow.h

#ifndef	SHADOW_H
#define SHADOW_H

#include "common.h"

//uniform	sampler	s_smap	: register(ps,s0);	// 2D/cube shadowmap
//Texture2D<float>	s_smap;		// 2D/cube shadowmap
//	Used for RGBA texture too ?!
Texture2D	s_smap : register(ps,t0);		// 2D/cube shadowmap

Texture2D<float>	s_smap_minmax;		// 2D/cube shadowmap
#include "gather.ps"

SamplerComparisonState		smp_smap;	//	Special comare sampler
sampler		smp_jitter;

Texture2D	jitter0;
Texture2D	jitter1;
//uniform sampler2D	jitter2;
//uniform sampler2D	jitter3;
//uniform float4 		jitterS;

Texture2D	jitterMipped;

#ifndef USE_ULTRA_SHADOWS
#define	KERNEL	0.6f
#else
#define	KERNEL	1.0f
#endif

float modify_light( float light )
{
   return ( light > 0.7 ? 1.0 : lerp( 0.0, 1.0, saturate( light / 0.7 ) ) );
}

//////////////////////////////////////////////////////////////////////////////////////////
// hardware + PCF
//////////////////////////////////////////////////////////////////////////////////////////
float sample_hw_pcf (float4 tc,float4 shift)
{
	static const float 	ts = KERNEL / float(SMAP_size);

	tc.xyz 	/= tc.w;
	tc.xy 	+= shift.xy * ts;

	return s_smap.SampleCmpLevelZero( smp_smap, tc.xy, tc.z).x;
}

#define GS2 3

float shadow_hw( float4 tc )
{
  	float	s0		= sample_hw_pcf( tc, float4( -1, -1, 0, 0) );
  	float	s1		= sample_hw_pcf( tc, float4( +1, -1, 0, 0) );
  	float	s2		= sample_hw_pcf( tc, float4( -1, +1, 0, 0) );
  	float	s3		= sample_hw_pcf( tc, float4( +1, +1, 0, 0) );

	return	(s0+s1+s2+s3)/4.h;
}

#if SUN_QUALITY>=4
#define FILTER_SIZE	11
#define FS  FILTER_SIZE
#define FS2 ( FILTER_SIZE / 2 )

static const float W2[11][11] =
                 { { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.0 },
			       { 0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.2,0.0 },
			       { 0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.2,0.0 },
			       { 0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.2,0.0 },
			       { 0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.2,0.0 },
			       { 0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.2,0.0 },
			       { 0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.2,0.0 },
			       { 0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.2,0.0 },
			       { 0.0,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       };

static const float W1[11][11] =
                 { { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.0,0.0 },
			       { 0.0,0.0,0.2,1.0,1.0,1.0,1.0,1.0,0.2,0.0,0.0 },
			       { 0.0,0.0,0.2,1.0,1.0,1.0,1.0,1.0,0.2,0.0,0.0 },
			       { 0.0,0.0,0.2,1.0,1.0,1.0,1.0,1.0,0.2,0.0,0.0 },
			       { 0.0,0.0,0.2,1.0,1.0,1.0,1.0,1.0,0.2,0.0,0.0 },
			       { 0.0,0.0,0.2,1.0,1.0,1.0,1.0,1.0,0.2,0.0,0.0 },
			       { 0.0,0.0,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       };

static const float W0[11][11] =
                 { { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.1,1.0,0.1,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       { 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 },
			       };

float Fw( int r, int c, float fL )
{
	return        (1.0-fL) * (1.0-fL) * (1.0-fL) * W0[r][c] +
	       3.0f * (1.0-fL) * (1.0-fL) *   fL     * W1[r][c] +
	       3.0f *    fL    *    fL    * (1.0-fL) * W2[r][c] +
                     fL    *    fL    *   fL     * 1.0f;
}

#define BLOCKER_FILTER_SIZE	11
#define BFS  BLOCKER_FILTER_SIZE
#define BFS2 ( BLOCKER_FILTER_SIZE / 2 )

#define SUN_WIDTH 300.0f

// uses gather for DX11/10.1 and visibilty encoding for DX10.0
float shadow_extreme_quality( float3 tc )
{
   float  s   = 0.0f;
   float2 stc = ( SMAP_size * tc.xy ) + float2( 0.5, 0.5 );
   float2 tcs = floor( stc );
   float2 fc;
   int    row;
   int    col;
   float  w = 0.0;
   float  avgBlockerDepth = 0;
   float  blockerCount = 0;
   float  fRatio;
   float4 v1[ FS2 + 1 ];
   float2 v0[ FS2 + 1 ];
   float2 off;

   fc     = stc - tcs;
   tc.xy  = tc.xy - ( (1.0f/SMAP_size) * fc );
   tc.z  -= 0.0001f;

#if defined(SM_4_1) || defined( SM_5)
    // find number of blockers and sum up blocker depth
    for( row = -BFS2; row <= BFS2; row += 2 )
    {
        for( col = -BFS2; col <= BFS2; col += 2 )
        {
            float4 d4 = s_smap.Gather( smp_nofilter, tc.xy, int2( col, row ) );
            float4 b4 = ( tc.zzzz <= d4 ) ? (0.0f).xxxx : (1.0f).xxxx;

            blockerCount += dot( b4, (1.0f).xxxx );
            avgBlockerDepth += dot( d4, b4 );
        }
    }
#else // SM_4_0
	uint vmask[ FS + 1 ];

    [unroll]for( col = 0; col <= FS; ++col )
		vmask[ col ] = uint(0);

	[unroll(11)]for( row = -FS2; row <= FS2; row +=2 )
    {
       [unroll]for( int col = -FS2; col <= FS2; col +=2 )
        {
            float4 d4;
            float  b;

			d4.w = s_smap.SampleLevel( smp_nofilter, tc.xy, 0, int2( col, row ) ).x;
			b = ( tc.z <= d4.w ) ? (0.0f) : (1.0f);
			vmask[ col + FS2 + 0 ] += ( ( tc.z <= d4.w ) ? ( uint(1) << uint( row + FS2 + 0 ) ) : uint(0) );
			blockerCount     += b;
            avgBlockerDepth  += d4.w * b;

			d4.z = s_smap.SampleLevel( smp_nofilter, tc.xy, 0, int2( col+1, row ) ).x;
			b = ( tc.z <= d4.z ) ? (0.0f) : (1.0f);
			vmask[ col + FS2 + 1 ] += ( ( tc.z <= d4.z ) ? ( uint(1) << uint( row + FS2 + 0 ) ) : uint(0) );
			blockerCount     += b;
            avgBlockerDepth  += d4.z * b;

			d4.x = s_smap.SampleLevel( smp_nofilter, tc.xy, 0, int2( col, row+1 ) ).x;
			vmask[ col + FS2 + 0 ] += ( ( tc.z <= d4.x ) ? ( uint(1) << uint( row + FS2 + 1 ) ) : uint(0) );
			b = ( tc.z <= d4.x ) ? (0.0f) : (1.0f);
			blockerCount     += b;
            avgBlockerDepth  += d4.x * b;

			d4.y = s_smap.SampleLevel( smp_nofilter, tc.xy, 0, int2( col+1, row+1 ) ).x;
			vmask[ col + FS2 + 1 ] += ( ( tc.z <= d4.y ) ? ( uint(1) << uint( row + FS2 + 1 ) ) : uint(0) );
			b = ( tc.z <= d4.y ) ? (0.0f) : (1.0f);
			blockerCount     += b;
            avgBlockerDepth  += d4.y * b;
        }
    }
#endif

   // compute ratio average blocker depth vs. pixel depth
   if( blockerCount > 0.0 )
   {
	   avgBlockerDepth /= blockerCount;
       fRatio = saturate( ( ( tc.z - avgBlockerDepth ) * SUN_WIDTH ) / avgBlockerDepth );
       fRatio *= fRatio;
   }
   else
   {
	   fRatio = 0.0;
   }

   for( row = 0; row < FS; ++row )
   {
      for( col = 0; col < FS; ++col )
         w += Fw(row,col,fRatio);
   }

    // filter shadow map samples using the dynamic weights
    [unroll(11)]for( row = -FS2; row <= FS2; row += 2 )
    {
        [unroll]for( int col = -FS2; col <= FS2; col += 2 )
        {
#if ( defined(SM_5) ) || ( defined(SM_4_1) )
#ifdef SM_5
            v1[(col+FS2)/2] = s_smap.GatherCmpRed( smp_smap, tc.xy, tc.z,
                                                   int2( col, row ) );
#else // SM_4_1
            float4 d4 = s_smap.Gather( smp_linear, tc.xy, int2( col, row ) );
            v1[(col+FS2)/2] = ( tc.zzzz <= d4 ) ? (1.0f).xxxx : (0.0f).xxxx;
#endif
#else
            v1[(col+FS2)/2].w = ( ( vmask[ col + FS2 + 0 ] & ( uint(1) << uint( row + FS2 + 0 ) ) ) ? 1.0f : 0.0f );
            v1[(col+FS2)/2].z = ( ( vmask[ col + FS2 + 1 ] & ( uint(1) << uint( row + FS2 + 0 ) ) ) ? 1.0f : 0.0f );
            v1[(col+FS2)/2].x = ( ( vmask[ col + FS2 + 0 ] & ( uint(1) << uint( row + FS2 + 1 ) ) ) ? 1.0f : 0.0f );
            v1[(col+FS2)/2].y = ( ( vmask[ col + FS2 + 1 ] & ( uint(1) << uint( row + FS2 + 1 ) ) ) ? 1.0f : 0.0f );
#endif
		  if( col == -FS2 )
		  {
			 s += ( 1 - fc.y ) * ( v1[0].w * ( Fw(row+FS2,0,fRatio) - Fw(row+FS2,0,fRatio) * fc.x ) + v1[0].z * ( fc.x * ( Fw(row+FS2,0,fRatio) - Fw(row+FS2,1,fRatio) ) +  Fw(row+FS2,1,fRatio) ) );
			 s += (     fc.y ) * ( v1[0].x * ( Fw(row+FS2,0,fRatio) - Fw(row+FS2,0,fRatio) * fc.x ) + v1[0].y * ( fc.x * ( Fw(row+FS2,0,fRatio) - Fw(row+FS2,1,fRatio) ) +  Fw(row+FS2,1,fRatio) ) );
			 if( row > -FS2 )
			 {
				s += ( 1 - fc.y ) * ( v0[0].x * ( Fw(row+FS2-1,0,fRatio) - Fw(row+FS2-1,0,fRatio) * fc.x ) + v0[0].y * ( fc.x * ( Fw(row+FS2-1,0,fRatio) - Fw(row+FS2-1,1,fRatio) ) +  Fw(row+FS2-1,1,fRatio) ) );
				s += (     fc.y ) * ( v1[0].w * ( Fw(row+FS2-1,0,fRatio) - Fw(row+FS2-1,0,fRatio) * fc.x ) + v1[0].z * ( fc.x * ( Fw(row+FS2-1,0,fRatio) - Fw(row+FS2-1,1,fRatio) ) +  Fw(row+FS2-1,1,fRatio) ) );
			 }
		  }
		  else if( col == FS2 )
		  {
			 s += ( 1 - fc.y ) * ( v1[FS2].w * ( fc.x * ( Fw(row+FS2,FS-2,fRatio) - Fw(row+FS2,FS-1,fRatio) ) + Fw(row+FS2,FS-1,fRatio) ) + v1[FS2].z * fc.x * Fw(row+FS2,FS-1,fRatio) );
			 s += (     fc.y ) * ( v1[FS2].x * ( fc.x * ( Fw(row+FS2,FS-2,fRatio) - Fw(row+FS2,FS-1,fRatio) ) + Fw(row+FS2,FS-1,fRatio) ) + v1[FS2].y * fc.x * Fw(row+FS2,FS-1,fRatio) );
			 if( row > -FS2 )
			 {
				s += ( 1 - fc.y ) * ( v0[FS2].x * ( fc.x * ( Fw(row+FS2-1,FS-2,fRatio) - Fw(row+FS2-1,FS-1,fRatio) ) + Fw(row+FS2-1,FS-1,fRatio) ) + v0[FS2].y * fc.x * Fw(row+FS2-1,FS-1,fRatio) );
				s += (     fc.y ) * ( v1[FS2].w * ( fc.x * ( Fw(row+FS2-1,FS-2,fRatio) - Fw(row+FS2-1,FS-1,fRatio) ) + Fw(row+FS2-1,FS-1,fRatio) ) + v1[FS2].z * fc.x * Fw(row+FS2-1,FS-1,fRatio) );
			 }
		  }
		  else
		  {
			 s += ( 1 - fc.y ) * ( v1[(col+FS2)/2].w * ( fc.x * ( Fw(row+FS2,col+FS2-1,fRatio) - Fw(row+FS2,col+FS2+0,fRatio) ) + Fw(row+FS2,col+FS2+0,fRatio) ) +
						           v1[(col+FS2)/2].z * ( fc.x * ( Fw(row+FS2,col+FS2-0,fRatio) - Fw(row+FS2,col+FS2+1,fRatio) ) + Fw(row+FS2,col+FS2+1,fRatio) ) );
			 s += (     fc.y ) * ( v1[(col+FS2)/2].x * ( fc.x * ( Fw(row+FS2,col+FS2-1,fRatio) - Fw(row+FS2,col+FS2+0,fRatio) ) + Fw(row+FS2,col+FS2+0,fRatio) ) +
						           v1[(col+FS2)/2].y * ( fc.x * ( Fw(row+FS2,col+FS2-0,fRatio) - Fw(row+FS2,col+FS2+1,fRatio) ) + Fw(row+FS2,col+FS2+1,fRatio) ) );
			 if( row > -FS2 )
			 {
				s += ( 1 - fc.y ) * ( v0[(col+FS2)/2].x * ( fc.x * ( Fw(row+FS2-1,col+FS2-1,fRatio) - Fw(row+FS2-1,col+FS2+0,fRatio) ) + Fw(row+FS2-1,col+FS2+0,fRatio) ) +
							          v0[(col+FS2)/2].y * ( fc.x * ( Fw(row+FS2-1,col+FS2-0,fRatio) - Fw(row+FS2-1,col+FS2+1,fRatio) ) + Fw(row+FS2-1,col+FS2+1,fRatio) ) );
				s += (     fc.y ) * ( v1[(col+FS2)/2].w * ( fc.x * ( Fw(row+FS2-1,col+FS2-1,fRatio) - Fw(row+FS2-1,col+FS2+0,fRatio) ) + Fw(row+FS2-1,col+FS2+0,fRatio) ) +
							          v1[(col+FS2)/2].z * ( fc.x * ( Fw(row+FS2-1,col+FS2-0,fRatio) - Fw(row+FS2-1,col+FS2+1,fRatio) ) + Fw(row+FS2-1,col+FS2+1,fRatio) ) );
			 }
	      }
		  if( row != FS2 )
			v0[(col+FS2)/2] = v1[(col+FS2)/2].xy;
	   }
   }

   return s/w;
}

float4 Fw( int r, int c )
{
	return float4( W0[r][c], W1[r][c], W2[r][c], 1.0f );
}

//======================================================================================
// This shader computes the contact hardening shadow filter
//======================================================================================
float shadow_extreme_quality_fused( float3 tc )
{
    float4 s   = (0.0f).xxxx;
    float2 stc = ( SMAP_size * tc.xy ) + float2( 0.5, 0.5 );
    float2 tcs = floor( stc );
    float2 fc;
    int    row;
    int    col;
    float  w = 0.0;
    float  avgBlockerDepth = 0;
    float  blockerCount = 0;
    float  fRatio;
    float4 v1[ FS2 + 1 ];
    float2 v0[ FS2 + 1 ];
    float2 off;

    fc     = stc - tcs;
    tc.xy  = tc.xy - ( fc * (1.0f/SMAP_size) );

    // filter shadow map samples using the dynamic weights
    [unroll(FS)]for( row = -FS2; row <= FS2; row += 2 )
    {
        for( col = -FS2; col <= FS2; col += 2 )
        {
            float4 d4;

#ifndef PS_4
            d4 = s_smap.Gather( smp_nofilter, tc.xy + (1.0f/SMAP_size) * float2( col, row ) );
#else
			d4.w = s_smap.SampleLevel( smp_nofilter, tc.xy + (1.0f/SMAP_size) * float2( col, row ), 0 ).x;
			d4.z = s_smap.SampleLevel( smp_nofilter, tc.xy + (1.0f/SMAP_size) * float2( col+1, row ) , 0 ).x;
			d4.y = s_smap.SampleLevel( smp_nofilter, tc.xy + (1.0f/SMAP_size) * float2( col+1, row+1 ), 0 ).x;
			d4.x = s_smap.SampleLevel( smp_nofilter, tc.xy + (1.0f/SMAP_size) * float2( col, row+1 ), 0 ).x;
#endif
            float4 b4  = ( tc.zzzz <= d4 ) ? (0.0f).xxxx : (1.0f).xxxx;

            v1[(col+FS2)/2] = ( tc.zzzz <= d4 ) ? (1.0f).xxxx : (0.0f).xxxx;
            blockerCount += dot( b4, (1.0).xxxx );
            avgBlockerDepth += dot( d4, b4 );

            if( col == -FS2 )
            {
                s += ( 1 - fc.y ) * ( v1[0].w * ( Fw(row+FS2,0) -
                                      Fw(row+FS2,0) * fc.x ) + v1[0].z *
                                    ( fc.x * ( Fw(row+FS2,0) -
                                      Fw(row+FS2,1) ) +
                                      Fw(row+FS2,1) ) );
                s += (     fc.y ) * ( v1[0].x * ( Fw(row+FS2,0) -
                                      Fw(row+FS2,0) * fc.x ) +
                                      v1[0].y * ( fc.x * ( Fw(row+FS2,0) -
                                      Fw(row+FS2,1) ) +
                                      Fw(row+FS2,1) ) );
                if( row > -FS2 )
                {
                    s += ( 1 - fc.y ) * ( v0[0].x * ( Fw(row+FS2-1,0) -
                                          Fw(row+FS2-1,0) * fc.x ) + v0[0].y *
                                        ( fc.x * ( Fw(row+FS2-1,0) -
                                          Fw(row+FS2-1,1) ) +
                                          Fw(row+FS2-1,1) ) );
                    s += (     fc.y ) * ( v1[0].w * ( Fw(row+FS2-1,0) -
                                          Fw(row+FS2-1,0) * fc.x ) + v1[0].z *
                                        ( fc.x * ( Fw(row+FS2-1,0) -
                                          Fw(row+FS2-1,1) ) +
                                          Fw(row+FS2-1,1) ) );
                }
            }
            else if( col == FS2 )
            {
                s += ( 1 - fc.y ) * ( v1[FS2].w * ( fc.x * ( Fw(row+FS2,FS-2) -
                                      Fw(row+FS2,FS-1) ) +
                                      Fw(row+FS2,FS-1) ) + v1[FS2].z * fc.x *
                                      Fw(row+FS2,FS-1) );
                s += (     fc.y ) * ( v1[FS2].x * ( fc.x * ( Fw(row+FS2,FS-2) -
                                      Fw(row+FS2,FS-1) ) +
                                      Fw(row+FS2,FS-1) ) + v1[FS2].y * fc.x *
                                      Fw(row+FS2,FS-1) );
                if( row > -FS2 )
                {
                    s += ( 1 - fc.y ) * ( v0[FS2].x * ( fc.x *
                                        ( Fw(row+FS2-1,FS-2) -
                                          Fw(row+FS2-1,FS-1) ) +
                                          Fw(row+FS2-1,FS-1) ) +
                                          v0[FS2].y * fc.x * Fw(row+FS2-1,FS-1) );
                    s += (     fc.y ) * ( v1[FS2].w * ( fc.x *
                                        ( Fw(row+FS2-1,FS-2) -
                                          Fw(row+FS2-1,FS-1) ) +
                                          Fw(row+FS2-1,FS-1) ) +
                                          v1[FS2].z * fc.x * Fw(row+FS2-1,FS-1) );
                }
            }
            else
            {
                s += ( 1 - fc.y ) * ( v1[(col+FS2)/2].w * ( fc.x *
                                    ( Fw(row+FS2,col+FS2-1) -
                                      Fw(row+FS2,col+FS2+0) ) +
                                      Fw(row+FS2,col+FS2+0) ) +
                                      v1[(col+FS2)/2].z * ( fc.x *
                                    ( Fw(row+FS2,col+FS2-0) -
                                      Fw(row+FS2,col+FS2+1) ) +
                                      Fw(row+FS2,col+FS2+1) ) );
                s += (     fc.y ) * ( v1[(col+FS2)/2].x * ( fc.x *
                                    ( Fw(row+FS2,col+FS2-1) -
                                      Fw(row+FS2,col+FS2+0) ) +
                                      Fw(row+FS2,col+FS2+0) ) +
                                      v1[(col+FS2)/2].y * ( fc.x *
                                    ( Fw(row+FS2,col+FS2-0) -
                                      Fw(row+FS2,col+FS2+1) ) +
                                      Fw(row+FS2,col+FS2+1) ) );
                if( row > -FS2 )
                {
                    s += ( 1 - fc.y ) * ( v0[(col+FS2)/2].x * ( fc.x *
                                        ( Fw(row+FS2-1,col+FS2-1) -
                                          Fw(row+FS2-1,col+FS2+0) ) +
                                          Fw(row+FS2-1,col+FS2+0) ) +
                                          v0[(col+FS2)/2].y * ( fc.x *
                                        ( Fw(row+FS2-1,col+FS2-0) -
                                          Fw(row+FS2-1,col+FS2+1) ) +
                                          Fw(row+FS2-1,col+FS2+1) ) );
                    s += (     fc.y ) * ( v1[(col+FS2)/2].w * ( fc.x *
                                        ( Fw(row+FS2-1,col+FS2-1) -
                                          Fw(row+FS2-1,col+FS2+0) ) +
                                          Fw(row+FS2-1,col+FS2+0) ) +
                                          v1[(col+FS2)/2].z * ( fc.x *
                                        ( Fw(row+FS2-1,col+FS2-0) -
                                          Fw(row+FS2-1,col+FS2+1) ) +
                                          Fw(row+FS2-1,col+FS2+1) ) );
                }
            }

            if( row != FS2 )
            {
                v0[(col+FS2)/2] = v1[(col+FS2)/2].xy;
            }
        }
    }

    // compute ratio using formulas from PCSS
    if( blockerCount > 0.0 )
    {
        avgBlockerDepth /= blockerCount;
        fRatio = saturate( ( ( tc.z - avgBlockerDepth ) * SUN_WIDTH ) / avgBlockerDepth );
        fRatio *= fRatio;
    }
    else
    {
        fRatio = 0.0;
    }

    // sum up weights of dynamic filter matrix
    for( row = 0; row < FS; ++row )
    {
       for( col = 0; col < FS; ++col )
       {
          w += Fw(row,col,fRatio);
       }
    }

    return dot(s, float4((1.0f-fRatio)*(1.0f-fRatio)*(1.0f-fRatio),
						 3.0f * (1.0-fRatio)*(1.0-fRatio)*fRatio,
						 3.0f * fRatio*fRatio*(1.0-fRatio),
						 fRatio*fRatio*fRatio ) )/w;
}
#endif

#ifdef SM_4_1

float dx10_1_hw_hq_7x7( float3 tc )
{
   float  s = 0.0f;
   float2 stc = ( SMAP_size * tc.xy ) + float2( 0.5, 0.5 );
   float2 tcs = floor( stc );
   float2 fc;
   int    row;
   int    col;

   fc.xy = stc - tcs;
   tc.xy = tcs * ( 1.0 / SMAP_size );

   // loop over the rows
   for( row = -GS2; row <= GS2; row += 2 )
   {
       [unroll]for( col = -GS2; col <= GS2; col += 2 )
       {
            float4 v = ( tc.zzzz <= s_smap.Gather( smp_nofilter, tc.xy, int2( col, row ) ) ) ? (1.0).xxxx : (0.0).xxxx;

            if( row == -GS2 ) // top row
            {
                if( col == -GS2 ) // left
                    s += dot( float4( 1.0-fc.x, 1.0, 1.0-fc.y, (1.0-fc.x)*(1.0-fc.y) ), v );
                else if( col == GS2 ) // right
                    s += dot( float4( 1.0f, fc.x, fc.x*(1.0-fc.y), 1.0-fc.y ), v );
                else // center
                    s += dot( float4( 1.0, 1.0, 1.0-fc.y, 1.0-fc.y ), v );
            }
            else if( row == GS2 )  // bottom row
            {
                if( col == -GS2 ) // left
                    s += dot( float4( (1.0-fc.x)*fc.y, fc.y, 1.0, (1.0-fc.x) ), v );
                else if( col == GS2 ) // right
                    s += dot( float4( fc.y, fc.x*fc.y, fc.x, 1.0 ), v );
                else // center
                    s += dot( float4(fc.yy,1.0,1.0), v );
            }
            else // center rows
            {
                if( col == -GS2 ) // left
                    s += dot( float4( (1.0-fc.x), 1.0, 1.0, (1.0-fc.x) ), v );
                else if( col == GS2 ) // right
                    s += dot( float4( 1.0, fc.x, fc.x, 1.0 ), v );
                else // center
                    s += dot( (1.0).xxxx, v );
            }
        }
   }

   return s*(1.0/49.0);
}

#endif

float dx10_0_hw_hq_7x7( float4 tc )
{
   tc.xyz /= tc.w;

   float  s   = 0.0;
   float2 stc = ( SMAP_size * tc.xy ) + float2( 0.5, 0.5 );
   float2 tcs = floor( stc );
   float2 fc;

   fc    = stc - tcs;
   tc.xy = tc.xy - ( fc * ( 1.0/SMAP_size ) );

   float2 pwAB = ( ( 2.0 ).xx - fc );
   float2 tcAB = ( 1.0/SMAP_size ).xx / pwAB;
   float2 tcM  = (0.5/SMAP_size ).xx;
   float2 pwGH = ( ( 1.0 ).xx + fc );
   float2 tcGH = (1.0/SMAP_size) * ( fc / pwGH );

   for( int row = -GS2; row <= GS2; row += 2 )
   {
      for( int col = -GS2; col <= GS2; col += 2 )
	  {
		if( row == -GS2 ) // top row
		{
			if( col == -GS2 ) // left
				s += ( pwAB.x * pwAB.y ) * s_smap.SampleCmpLevelZero( smp_smap, tc.xy + tcAB, tc.z, int2( col, row ) ).x;
			else if( col == GS2 ) // right
				s += ( pwGH.x * pwAB.y ) * s_smap.SampleCmpLevelZero( smp_smap, tc.xy + float2( tcGH.x, tcAB.y), tc.z, int2( col, row ) ).x;
			else // center
				s += (    2.0 * pwAB.y ) * s_smap.SampleCmpLevelZero( smp_smap, tc.xy + float2( tcM.x, tcAB.y), tc.z, int2( col, row ) ).x;
		}
		else if( row == GS2 )  // bottom row
		{
			if( col == -GS2 ) // left
				s += ( pwAB.x * pwGH.y ) * s_smap.SampleCmpLevelZero( smp_smap, tc.xy + float2( tcAB.x, tcGH.y ), tc.z, int2( col, row ) ).x;
			else if( col == GS2 ) // right
				s += ( pwGH.x * pwGH.y ) * s_smap.SampleCmpLevelZero( smp_smap, tc.xy + tcGH, tc.z, int2( col, row ) ).x;
			else // center
				s += (    2.0 * pwGH.y ) * s_smap.SampleCmpLevelZero( smp_smap, tc.xy + float2( tcM.x, tcGH.y ), tc.z, int2( col, row ) ).x;
		}
		else // center rows
		{
			if( col == -GS2 ) // left
				s += ( pwAB.x * 2.0    ) * s_smap.SampleCmpLevelZero( smp_smap, tc.xy + float2( tcAB.x, tcM.y ), tc.z, int2( col, row ) ).x;
			else if( col == GS2 ) // right
				s += ( pwGH.x * 2.0    ) * s_smap.SampleCmpLevelZero( smp_smap, tc.xy + float2( tcGH.x, tcM.y),  tc.z, int2( col, row ) ).x;
			else // center
				s += (    2.0 * 2.0    ) * s_smap.SampleCmpLevelZero( smp_smap, tc.xy + tcM, tc.z, int2( col, row ) ).x;
		}
      }
	}

    return s/49.0;
}

#ifdef SM_MINMAX
bool cheap_reject( float3 tc, inout bool full_light )
{
   float4 plane0  = sm_minmax_gather( tc.xy, int2( -1,-1 ) );
   float4 plane1  = sm_minmax_gather( tc.xy, int2(  1,-1 ) );
   float4 plane2  = sm_minmax_gather( tc.xy, int2( -1, 1 ) );
   float4 plane3  = sm_minmax_gather( tc.xy, int2(  1, 1 ) );
   bool plane     = all( ( plane0 >= (0).xxxx ) * ( plane1 >= (0).xxxx ) * ( plane2 >= (0).xxxx ) * ( plane3 >= (0).xxxx ) );

   [flatten] if( !plane ) // if there are no proper plane equations in the support region
   {
      bool no_plane  = all( ( plane0 < (0).xxxx ) * ( plane1 < (0).xxxx ) * ( plane2 < (0).xxxx ) * ( plane3 < (0).xxxx ) );
      float4 z       = ( tc.z - 0.0005 ).xxxx;
      bool reject    = all( ( z > -plane0 ) * ( z > -plane1 ) * ( z > -plane2 ) * ( z > -plane3 ) );
      [flatten] if( no_plane && reject )
      {
         full_light = false;
         return true;
      }
      else
      {
         return false;
      }
   }
   else // plane equation detected
   {
      // compute corrected z for texel pos
      static const float scale = float( SMAP_size / 4 );
      float2 fc  = frac( tc.xy * scale );
      float  z   = lerp( lerp( plane0.y, plane1.x, fc.x ), lerp( plane2.z, plane3.w, fc.x ), fc.y );

      // do minmax test with new z
      full_light = ( ( tc.z - 0.0001 ) <= z );

      return true;
   }
}

#endif	//	SM_MINMAX

float shadow_hw_hq( float4 tc )
{
#ifdef SM_MINMAX
   bool   full_light = false;
   bool   cheap_path = cheap_reject( tc.xyz / tc.w, full_light );

   [branch] if( cheap_path )
   {
      [branch] if( full_light == true )
         return 1.0;
      else
         return sample_hw_pcf( tc, (0).xxxx );
   }
   else
   {
#if SUN_QUALITY>=4 // extreme quality
      return shadow_extreme_quality( tc.xyz / tc.w );
#else // SUN_QUALITY<4
#ifdef SM_4_1
      return dx10_1_hw_hq_7x7( tc.xyz / tc.w );
#else // SM_4_1
      return dx10_0_hw_hq_7x7( tc );
#endif // SM_4_1
#endif //SUN_QUALITY==4
   }
#else //	SM_MINMAX
#if SUN_QUALITY>=4 // extreme quality
      return shadow_extreme_quality( tc.xyz / tc.w );
#else // SUN_QUALITY<4
#ifdef SM_4_1
      return dx10_1_hw_hq_7x7( tc.xyz / tc.w );
#else // SM_4_1
      return dx10_0_hw_hq_7x7( tc );
#endif // SM_4_1
#endif //SUN_QUALITY==4
#endif //	SM_MINMAX
}

//////////////////////////////////////////////////////////////////////////////////////////
//	D24X8+PCF
//////////////////////////////////////////////////////////////////////////////////////////

float4 	test 		(float4 tc, float2 offset)
{

//	float4	tcx	= float4 (tc.xy + tc.w*offset, tc.zw);
//	return 	tex2Dproj (s_smap,tcx);

	tc.xyz 	/= tc.w;
	tc.xy 	+= offset;
	return s_smap.SampleCmpLevelZero( smp_smap, tc.xy, tc.z).x;
}

/*half 	shadowtest_sun 	(float4 tc, float4 tcJ)			// jittered sampling
{
	half4	r;

	const 	float 	scale 	= (0.5f/float(SMAP_size));

	float  	texsize = 2*SMAP_size;
	float2 	tc_J	= tc.xy/tc.w*texsize/8.0f;
	float2 	fr 		= frac(tc_J)*.5f;

//	half4	J0 	= tex2D	(jitter0,fr)*scale;
//	half4	J1 	= tex2D	(jitter1,fr)*scale*2;
	float4	J0 	= jitter0.Sample( smp_jitter, fr )*scale;
//	float4	J1 	= jitter1.Sample( smp_jitter, fr )*scale;

	float k = 0.99f/float(SMAP_size);
	r.x 	= test 	(tc,J0.xy+float2(-k,-k)).x;
	r.y 	= test 	(tc,J0.wz+float2( k,-k)).y;

 	r.z		= test	(tc,J0.xy+float2(-k, k)).z;
 	r.w		= test	(tc,J0.wz+float2( k, k)).x;

	half4	f;
	float k1 = 1.5f/float(SMAP_size);
	f.x 	= test 	(tc,-J0.xy+float2(-k1,0)).x;
	f.y 	= test 	(tc,-J0.wz+float2( 0,-k1)).y;

	f.z		= test	(tc,-J0.xy+float2( k1, 0)).z;
 	f.w		= test	(tc,-J0.wz+float2( 0, k1)).x;

	half res = ( r.x + r.y + r.z + r.w + f.x + f.y + f.z + f.w )*1.h/(4.h + 4.h );
	return res;
}*/
half 	shadowtest_sun 	(float4 tc, float4 tcJ)			// jittered sampling
{
	half4	r;

	//	const 	float 	scale 	= (2.0f/float(SMAP_size));
	const 	float 	scale 	= (0.7f/float(SMAP_size));


	float2 	tc_J	= frac(tc.xy/tc.w*SMAP_size/4.0f )*.5f;
	float4	J0		= jitter0.Sample(smp_jitter,tc_J)*scale;
	//half4	J1 	= tex2D	(jitter1,tc_J)*scale;

	const float k = .5f/float(SMAP_size);
	r.x 	= test 	(tc, J0.xy+half2(-k,-k)).x;
	r.y 	= test 	(tc, J0.wz+half2( k,-k)).y;
	r.z		= test	(tc,-J0.xy+half2(-k, k)).z;
	r.w		= test	(tc,-J0.wz+half2( k, k)).x;

	return	dot(r,1.h/4.h);
}

half 	shadow_high 	(float4 tc)			// jittered sampling
{

	const	float 	scale 	= (0.5f/float(SMAP_size));

	float2 	tc_J	= frac(tc.xy/tc.w*SMAP_size/4.0f )*.5f;
	float4	J0 		=	jitter0.Sample	(smp_jitter,tc_J)*scale;

	const float k = 1.f/float(SMAP_size);
	half4	r;
	r.x 	= test 	(tc,J0.xy+half2(-k,-k)).x;
	r.y 	= test 	(tc,J0.wz+half2( k,-k)).y;

	r.z		= test	(tc,J0.xy+half2(-k, k)).z;
	r.w		= test	(tc,J0.wz+half2( k, k)).x;


	const float k1 = 1.3f/float(SMAP_size);
	half4	r1;
	r1.x 	= test 	(tc,-J0.xy+half2(-k1,0)).x;
	r1.y 	= test 	(tc,-J0.wz+half2( 0,-k1)).y;

	r1.z	= test	(tc,-2*J0.xy+half2( k1, 0)).z;
	r1.w	= test	(tc,-2*J0.wz+half2( 0, k1)).x;

	return ( r.x + r.y + r.z + r.w + r1.x + r1.y + r1.z + r1.w )*1.h/8.h;
}

float shadow( float4 tc )
{
#ifdef USE_ULTRA_SHADOWS
#	ifdef SM_MINMAX
		return modify_light( shadow_hw_hq( tc ) );
#	else
		return shadow_hw_hq( tc );
#	endif
#else
#	if SUN_QUALITY>=2 // Hight quality
		//return shadowtest_sun 	( tc, float4(0,0,0,0) );			// jittered sampling;
		return shadow_hw		(tc);
#	else
		return shadow_hw		(tc);
#	endif
#endif
}

float shadow_volumetric( float4 tc )
{
	return sample_hw_pcf	(tc,float4(-1,-1,0,0));
}


#ifdef SM_MINMAX

//////////////////////////////////////////////////////////////////////////////////////////
// hardware + PCF
//////////////////////////////////////////////////////////////////////////////////////////

float shadow_dx10_1( float4 tc, float2 tcJ, float2 pos2d )
{
   return shadow( tc );
}

float shadow_dx10_1_sunshafts( float4 tc, float2 pos2d )
{
   float3 t         = tc.xyz / tc.w;
   float minmax     = s_smap_minmax.SampleLevel( smp_nofilter, t, 0 ).x;
   bool   umbra     = ( ( minmax.x < 0 ) && ( t.z > -minmax.x ) );

   [branch] if( umbra )
   {
      return 0.0;
   }
   else
   {
      return shadow_hw( tc );
   }
}

#endif


//////////////////////////////////////////////////////////////////////////////////////////
// testbed

//uniform sampler2D	jitter0;
//uniform sampler2D	jitter1;
float 	shadowtest 	(float4 tc, float4 tcJ)				// jittered sampling
{
	float4	r;

	const 	float 	scale 	= (2.7f/float(SMAP_size));

//	float4	J0 	= tex2Dproj	(jitter0,tcJ)*scale;
//	float4	J1 	= tex2Dproj	(jitter1,tcJ)*scale;
	tcJ.xy		/=	tcJ.w;
	float4	J0 	= jitter0.Sample( smp_jitter, tcJ )*scale;
	float4	J1 	= jitter1.Sample( smp_jitter, tcJ )*scale;

		r.x 	= test 	(tc,J0.xy).x;
		r.y 	= test 	(tc,J0.wz).y;
		r.z		= test	(tc,J1.xy).z;
		r.w		= test	(tc,J1.wz).x;

	return	dot(r,1.h/4.h);
}

float 	shadow_rain 	(float4 tc, float2 tcJ)			// jittered sampling
{
	float4	r;

	const 	float 	scale 	= (4.0f/float(SMAP_size));
//	float4	J0 	= jitter0.Sample( smp_jitter, tcJ )*scale;
//	float4	J1 	= jitter1.Sample( smp_jitter, tcJ )*scale;
	float4	J0 	= jitter0.Sample( smp_linear, tcJ )*scale;
	float4	J1 	= jitter1.Sample( smp_linear, tcJ )*scale;

	r.x 	= test 	(tc,J0.xy).x;
	r.y 	= test 	(tc,J0.wz).y;
	r.z		= test	(tc,J1.xy).z;
	r.w		= test	(tc,J1.wz).x;

//	float4	J0 	= jitterMipped.Sample( smp_base, tcJ )*scale;

//	r.x 	= test 	(tc,J0.xy).x;
//	r.y 	= test 	(tc,J0.wz).y;
//	r.z		= test	(tc,J0.yz).z;
//	r.w		= test	(tc,J0.xw).x;

	return	dot(r,1.h/4.h);
}

//////////////////////////////////////////////////////////////////////////////////////////
#ifdef  USE_SUNMASK
float3x4 m_sunmask;	// ortho-projection
float sunmask( float4 P )
{
	float2 		tc	= mul( m_sunmask, P );		//
//	return 		tex2D( s_lmap, tc ).w;			// A8
	return 		s_lmap.Sample( smp_linear, tc ).w;	// A8
}
#else
float sunmask( float4 P ) { return 1.h; }		//
#endif
//////////////////////////////////////////////////////////////////////////////////////////
uniform float4x4	m_shadow;

#endif