snxxz_client.git

			@@ -1,91 +1,91 @@
			#include "UnityCG.cginc"

			// Histogram bins. Per bin: x = red, y = green, z = blue, w = luminance counts.
			// The kernels below address bins 0..255 of this buffer.
			RWStructuredBuffer<uint4> _Histogram;
			// Image whose pixels are binned by KHistogramGather.
			Texture2D<float4> _Source;

			CBUFFER_START (Params)
			// When > 0, sampled colors go through LinearToGammaSpace before binning.
			uint _IsLinear;
			// xy: bounds tested against the dispatch thread id in the gather and clear kernels.
			// y is also the value the highest bin is scaled to in KHistogramScale.
			float4 _Res;
			// Per-channel enable flags (x = red, y = green, z = blue, w = luminance); > 0 means enabled.
			uint4 _Channels;
			CBUFFER_END

			// Per-group histogram: zeroed, filled with InterlockedAdd, then merged into
			// _Histogram by KHistogramGather.
			groupshared uint4 gs_histogram[256];

			// Thread-group edge length; GROUP_SIZE * GROUP_SIZE = 256 threads, one per bin.
			#define GROUP_SIZE 16

			// Gather pass: each thread bins one pixel of _Source into the group-shared
			// histogram, then each thread merges one bin of it into _Histogram.
			#pragma kernel KHistogramGather
			[numthreads(GROUP_SIZE, GROUP_SIZE,1)]
			void KHistogramGather(uint2 dispatchThreadId : SV_DispatchThreadID, uint2 groupThreadId : SV_GroupThreadID)
			{
				// Flattened position of this thread inside its group; also the bin it owns.
				const uint bin = groupThreadId.x + groupThreadId.y * GROUP_SIZE;
				const bool ownsBin = bin < 256;

				// Reset this thread's bin before any thread in the group starts counting.
				if (ownsBin)
				{
					gs_histogram[bin] = uint4(0, 0, 0, 0);
				}

				GroupMemoryBarrierWithGroupSync();

				// Threads that fall outside the _Res.xy bounds contribute nothing.
				const bool insideSource = dispatchThreadId.x < (uint)_Res.x && dispatchThreadId.y < (uint)_Res.y;
				if (insideSource)
				{
					// The histogram is built in gamma space (like Photoshop & co.).
					float3 rgb = saturate(_Source[dispatchThreadId].xyz);
					if (_IsLinear > 0)
					{
						rgb = LinearToGammaSpace(rgb);
					}

					// Map color and luminance from [0, 1] onto bin indices 0..255.
					const float luma = dot(rgb, float3(0.2125, 0.7154, 0.0721));
					const uint3 colorBin = (uint3)(round(rgb * 255.0));
					const uint lumaBin = (uint)(round(luma * 255.0));

					// Count into the group-shared histogram, enabled channels only.
					if (_Channels.x > 0u) InterlockedAdd(gs_histogram[colorBin.x].x, 1); // Red
					if (_Channels.y > 0u) InterlockedAdd(gs_histogram[colorBin.y].y, 1); // Green
					if (_Channels.z > 0u) InterlockedAdd(gs_histogram[colorBin.z].z, 1); // Blue
					if (_Channels.w > 0u) InterlockedAdd(gs_histogram[lumaBin].w, 1); // Luminance
				}

				// Every thread of the group must be done counting before the merge.
				GroupMemoryBarrierWithGroupSync();

				// Merge: add this group's non-zero counts to the output buffer.
				if (ownsBin)
				{
					const uint4 groupCounts = gs_histogram[bin];
					if (_Channels.x > 0u && groupCounts.x > 0) InterlockedAdd(_Histogram[bin].x, groupCounts.x); // Red
					if (_Channels.y > 0u && groupCounts.y > 0) InterlockedAdd(_Histogram[bin].y, groupCounts.y); // Green
					if (_Channels.z > 0u && groupCounts.z > 0) InterlockedAdd(_Histogram[bin].z, groupCounts.z); // Blue
					if (_Channels.w > 0u && groupCounts.w > 0) InterlockedAdd(_Histogram[bin].w, groupCounts.w); // Luminance
				}
			}

			// Scaling pass
			// Scratch space for the max reduction, one slot per histogram bin.
			groupshared uint4 gs_pyramid[256];

			// Rescales all 256 bins so that, per channel, the highest bin becomes _Res.y.
			// Bins are addressed by the group-local thread id, so every dispatched group
			// works on the same 256 entries of _Histogram.
			#pragma kernel KHistogramScale
			[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
			void KHistogramScale(uint2 groupThreadId : SV_GroupThreadID)
			{
				const uint localThreadId = groupThreadId.y * GROUP_SIZE + groupThreadId.x;

				// Only this thread touches this bin in this kernel, so one load serves
				// both the reduction seed and the final scaling.
				const uint4 counts = _Histogram[localThreadId];
				gs_pyramid[localThreadId] = counts;

				GroupMemoryBarrierWithGroupSync();

				// Parallel reduction to find the per-channel max value; after the last
				// step it sits in gs_pyramid[0].
				UNITY_UNROLL
				for (uint i = 256 >> 1; i > 0; i >>= 1)
				{
					if (localThreadId < i)
						gs_pyramid[localThreadId] = max(gs_pyramid[localThreadId], gs_pyramid[localThreadId + i]);

					GroupMemoryBarrierWithGroupSync();
				}

				// Actual scaling.
				// A channel whose bins are all zero (e.g. one KHistogramGather skipped
				// because its _Channels flag is 0) has a max of 0. Dividing by it is a
				// division by zero whose result is not a usable count, so clamp the
				// divisor to 1: such a channel then simply stays at 0.
				const float4 peak = (float4)max(gs_pyramid[0], uint4(1u, 1u, 1u, 1u));
				const float4 factor = _Res.y / peak;
				_Histogram[localThreadId] = (uint4)round((float4)counts * factor);
			}

			// Clear pass: zeroes one _Histogram entry per thread inside the _Res.xy bounds.
			#pragma kernel KHistogramClear
			[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
			void KHistogramClear(uint2 dispatchThreadId : SV_DispatchThreadID)
			{
				// Threads outside the _Res.xy bounds have nothing to clear.
				const bool outside = dispatchThreadId.x >= (uint)_Res.x || dispatchThreadId.y >= (uint)_Res.y;
				if (outside)
					return;

				// Row-major slot with a row width of _Res.x, computed in float as before.
				// NOTE(review): nothing here limits the slot to the 256 bins the other
				// kernels use - confirm the dispatch size and _Res set by the caller.
				const float slot = dispatchThreadId.y * _Res.x + dispatchThreadId.x;
				_Histogram[slot] = uint4(0u, 0u, 0u, 0u);
			}
			#include "UnityCG.cginc"

			// Histogram bins. Per bin: x = red, y = green, z = blue, w = luminance counts.
			// The kernels below address bins 0..255 of this buffer.
			RWStructuredBuffer<uint4> _Histogram;
			// Image whose pixels are binned by KHistogramGather.
			Texture2D<float4> _Source;

			CBUFFER_START (Params)
			// When > 0, sampled colors go through LinearToGammaSpace before binning.
			uint _IsLinear;
			// xy: bounds tested against the dispatch thread id in the gather and clear kernels.
			// y is also the value the highest bin is scaled to in KHistogramScale.
			float4 _Res;
			// Per-channel enable flags (x = red, y = green, z = blue, w = luminance); > 0 means enabled.
			uint4 _Channels;
			CBUFFER_END

			// Per-group histogram: zeroed, filled with InterlockedAdd, then merged into
			// _Histogram by KHistogramGather.
			groupshared uint4 gs_histogram[256];

			// Thread-group edge length; GROUP_SIZE * GROUP_SIZE = 256 threads, one per bin.
			#define GROUP_SIZE 16

			// Gather pass: each thread bins one pixel of _Source into the group-shared
			// histogram, then each thread merges one bin of it into _Histogram.
			#pragma kernel KHistogramGather
			[numthreads(GROUP_SIZE, GROUP_SIZE,1)]
			void KHistogramGather(uint2 dispatchThreadId : SV_DispatchThreadID, uint2 groupThreadId : SV_GroupThreadID)
			{
				// Flattened position of this thread inside its group; also the bin it owns.
				const uint bin = groupThreadId.x + groupThreadId.y * GROUP_SIZE;
				const bool ownsBin = bin < 256;

				// Reset this thread's bin before any thread in the group starts counting.
				if (ownsBin)
				{
					gs_histogram[bin] = uint4(0, 0, 0, 0);
				}

				GroupMemoryBarrierWithGroupSync();

				// Threads that fall outside the _Res.xy bounds contribute nothing.
				const bool insideSource = dispatchThreadId.x < (uint)_Res.x && dispatchThreadId.y < (uint)_Res.y;
				if (insideSource)
				{
					// The histogram is built in gamma space (like Photoshop & co.).
					float3 rgb = saturate(_Source[dispatchThreadId].xyz);
					if (_IsLinear > 0)
					{
						rgb = LinearToGammaSpace(rgb);
					}

					// Map color and luminance from [0, 1] onto bin indices 0..255.
					const float luma = dot(rgb, float3(0.2125, 0.7154, 0.0721));
					const uint3 colorBin = (uint3)(round(rgb * 255.0));
					const uint lumaBin = (uint)(round(luma * 255.0));

					// Count into the group-shared histogram, enabled channels only.
					if (_Channels.x > 0u) InterlockedAdd(gs_histogram[colorBin.x].x, 1); // Red
					if (_Channels.y > 0u) InterlockedAdd(gs_histogram[colorBin.y].y, 1); // Green
					if (_Channels.z > 0u) InterlockedAdd(gs_histogram[colorBin.z].z, 1); // Blue
					if (_Channels.w > 0u) InterlockedAdd(gs_histogram[lumaBin].w, 1); // Luminance
				}

				// Every thread of the group must be done counting before the merge.
				GroupMemoryBarrierWithGroupSync();

				// Merge: add this group's non-zero counts to the output buffer.
				if (ownsBin)
				{
					const uint4 groupCounts = gs_histogram[bin];
					if (_Channels.x > 0u && groupCounts.x > 0) InterlockedAdd(_Histogram[bin].x, groupCounts.x); // Red
					if (_Channels.y > 0u && groupCounts.y > 0) InterlockedAdd(_Histogram[bin].y, groupCounts.y); // Green
					if (_Channels.z > 0u && groupCounts.z > 0) InterlockedAdd(_Histogram[bin].z, groupCounts.z); // Blue
					if (_Channels.w > 0u && groupCounts.w > 0) InterlockedAdd(_Histogram[bin].w, groupCounts.w); // Luminance
				}
			}

			// Scaling pass
			// Scratch space for the max reduction, one slot per histogram bin.
			groupshared uint4 gs_pyramid[256];

			// Rescales all 256 bins so that, per channel, the highest bin becomes _Res.y.
			// Bins are addressed by the group-local thread id, so every dispatched group
			// works on the same 256 entries of _Histogram.
			#pragma kernel KHistogramScale
			[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
			void KHistogramScale(uint2 groupThreadId : SV_GroupThreadID)
			{
				const uint localThreadId = groupThreadId.y * GROUP_SIZE + groupThreadId.x;

				// Only this thread touches this bin in this kernel, so one load serves
				// both the reduction seed and the final scaling.
				const uint4 counts = _Histogram[localThreadId];
				gs_pyramid[localThreadId] = counts;

				GroupMemoryBarrierWithGroupSync();

				// Parallel reduction to find the per-channel max value; after the last
				// step it sits in gs_pyramid[0].
				UNITY_UNROLL
				for (uint i = 256 >> 1; i > 0; i >>= 1)
				{
					if (localThreadId < i)
						gs_pyramid[localThreadId] = max(gs_pyramid[localThreadId], gs_pyramid[localThreadId + i]);

					GroupMemoryBarrierWithGroupSync();
				}

				// Actual scaling.
				// A channel whose bins are all zero (e.g. one KHistogramGather skipped
				// because its _Channels flag is 0) has a max of 0. Dividing by it is a
				// division by zero whose result is not a usable count, so clamp the
				// divisor to 1: such a channel then simply stays at 0.
				const float4 peak = (float4)max(gs_pyramid[0], uint4(1u, 1u, 1u, 1u));
				const float4 factor = _Res.y / peak;
				_Histogram[localThreadId] = (uint4)round((float4)counts * factor);
			}

			// Clear pass: zeroes one _Histogram entry per thread inside the _Res.xy bounds.
			#pragma kernel KHistogramClear
			[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
			void KHistogramClear(uint2 dispatchThreadId : SV_DispatchThreadID)
			{
				// Threads outside the _Res.xy bounds have nothing to clear.
				const bool outside = dispatchThreadId.x >= (uint)_Res.x || dispatchThreadId.y >= (uint)_Res.y;
				if (outside)
					return;

				// Row-major slot with a row width of _Res.x, computed in float as before.
				// NOTE(review): nothing here limits the slot to the 256 bins the other
				// kernels use - confirm the dispatch size and _Res set by the caller.
				const float slot = dispatchThreadId.y * _Res.x + dispatchThreadId.x;
				_Histogram[slot] = uint4(0u, 0u, 0u, 0u);
			}