#include "metalShaderTypes.h"
#include "alpha.h"
#include "commonMLDX.h"
#include "outputColorHelper.h"

// Rows of the RGB -> YUV conversion matrix. Each row is meant to be dot()'ed
// with an RGBA color; the w weight is 0.0 so alpha never contributes.
// The names say Rec. 709, and the coefficients look like the video-range
// variant of that matrix -- NOTE(review): confirm against the intended
// output spec. Range offsets are not part of these rows; callers add them.
constant float4 std709X = float4( 0.183,  0.614,  0.062, 0.0);  // luma (Y) weights
constant float4 std709Y = float4(-0.101, -0.338,  0.439, 0.0);  // first chroma (U) weights
constant float4 std709Z = float4( 0.439, -0.399, -0.040, 0.0);  // second chroma (V) weights


// Pass-through vertex stage: forwards the stored position untouched and emits
// two texture coordinates derived from the same source coordinate.
//   vertexID - index of the vertex being processed
//   in       - vertex records bound at buffer slot 0
// Returns the position plus tex1 (the stored coordinate) and tex2 (the same
// coordinate moved one unit along x).
vertex vertexOut vertexFunc(uint vertexID [[ vertex_id ]],
                                   const device pixCopyVertexFormat* in [[ buffer(0) ]])
{
    // Bind the record once rather than indexing the buffer for every field.
    const device pixCopyVertexFormat& src = in[vertexID];

    vertexOut result;
    result.position = src.pos;

    // Both coordinates start from the stored value; only tex2 is shifted.
    result.tex1 = src.tc;
    result.tex2 = src.tc;
    result.tex2.x += 1.0;

    return result;
}

// Chroma pass for 4:2:0 output: every output pixel carries two interleaved
// UV pairs, packed as (U0, V0, U1, V1).
//   input     - interpolated vertex output; tex1 is divided by the texture
//               dimensions before sampling, so it is treated as texel units
//   inputTex0 - source image bound at texture slot 0
// Only chroma is produced here, so the luma row of the matrix is not evaluated.
fragment float4 fragmentFunc(vertexOut input [[stage_in]],
                             texture2d<half> inputTex0 [[ texture(0) ]])
{
    const float2 texSize = float2(inputTex0.get_width(), inputTex0.get_height());

    // Gathering two UV samples for 4:2:0 would normally take eight texel reads.
    // Instead the coordinate is pushed half a pixel right and half a line down
    // so that one filtered read returns the average of a 2x2 area (this relies
    // on linearSampler, declared outside this file, doing linear filtering).
    float2 texCoord = input.tex1 + float2(.5, .5);
    const float4 rgba0 = toOutput(float4(inputTex0.sample(linearSampler, texCoord / texSize)));

    // Second read, for the second UV pair of this output pixel.
    // NOTE(review): the step is one texel. If tex1 is in full-resolution texel
    // units, the two 2x2 footprints overlap by a column and 2.0 would be the
    // non-overlapping step -- confirm against the host-side coordinate setup.
    texCoord.x += 1.0;
    const float4 rgba1 = toOutput(float4(inputTex0.sample(linearSampler, texCoord / texSize)));

    // The chroma rows give signed values; +0.5 re-centres them for storage.
    return float4(dot(std709Y, rgba0) + 0.5,
                  dot(std709Z, rgba0) + 0.5,
                  dot(std709Y, rgba1) + 0.5,
                  dot(std709Z, rgba1) + 0.5);
}

