/* -LICENSE-START-
 ** Copyright (c) 2023 Blackmagic Design
 **
 ** Permission is hereby granted, free of charge, to any person or organization
 ** obtaining a copy of the software and accompanying documentation (the
 ** "Software") to use, reproduce, display, distribute, sub-license, execute,
 ** and transmit the Software, and to prepare derivative works of the Software,
 ** and to permit third-parties to whom the Software is furnished to do so, in
 ** accordance with:
 **
 ** (1) if the Software is obtained from Blackmagic Design, the End User License
 ** Agreement for the Software Development Kit ("EULA") available at
 ** https://www.blackmagicdesign.com/EULA/DeckLinkSDK; or
 **
 ** (2) if the Software is obtained from any third party, such licensing terms
 ** as notified by that third party,
 **
 ** and all subject to the following:
 **
 ** (3) the copyright notices in the Software and this entire statement,
 ** including the above license grant, this restriction and the following
 ** disclaimer, must be included in all copies of the Software, in whole or in
 ** part, and all derivative works of the Software, unless such copies or
 ** derivative works are solely in the form of machine-executable object code
 ** generated by a source language processor.
 **
 ** (4) THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 ** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 ** FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 ** SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 ** FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 ** ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 ** DEALINGS IN THE SOFTWARE.
 **
 ** A copy of the Software is available free of charge at
 ** https://www.blackmagicdesign.com/desktopvideo_sdk under the EULA.
 **
 ** -LICENSE-END-
 */

#include "commonMLDX.h"
#include "alpha.h"
#include "outputColorHelper.h"

// Pick the texture that a given output row should be read from.
//
// gidy        row index of the calling thread
// inTexture1  returned for every row in a non-INTERLACED build, and for
//             even rows when INTERLACED is defined
// inTexture2  returned for odd rows when INTERLACED is defined; unused otherwise
inline texture2d<half, access::read> getInputTexture(
    uint gidy,
    texture2d<half, access::read> inTexture1,
    texture2d<half, access::read> inTexture2)
{
#ifdef INTERLACED
    // Lowest bit of the row index set => odd row => second texture.
    const bool isOddRow = (gidy & 1u) != 0u;
    if (isOddRow)
    {
        return inTexture2;
    }
#endif
    return inTexture1;
}

// Quantize a normalized component to an 8-bit code value.
// saturate() pins the input to [0, 1] before scaling so that overshoot clips to
// 0 / 255 instead of wrapping around (e.g. 1.01 used to become 258 & 0xFF = 2).
// The mask is kept as a last-resort guard so a stray value can never spill into
// a neighbouring byte of the packed word.
inline uint quantize8BitYuv(float value)
{
    return uint(round(saturate(value) * 255.0f)) & 0xFFu;
}

// Convert two horizontally adjacent input pixels into one packed 8-bit 4:2:2 word.
//
// Thread gid reads input pixels (2 * gid.x, gid.y) and (2 * gid.x + 1, gid.y),
// passes each through toOutput(), and treats the result's .r as luma (Y),
// .g as Cb and .b as Cr. Luma is kept per pixel; Cb and Cr are averaged over
// the pair. One 32-bit word is written to outTexture at gid.
//
// inTexture1 / inTexture2  source textures; see getInputTexture() for how the
//                          row index selects between them
// outTexture               destination, one uint per pixel pair
//
// NOTE(review): there is no bounds check against the texture dimensions; this
// presumably relies on the dispatch grid matching the output size and on an
// even input width - confirm against the host code.
kernel void BGRATo2vuyKernel8(
    texture2d<half, access::read>  inTexture1          [[texture(0)]],
    texture2d<half, access::read>  inTexture2          [[texture(1)]],
    texture2d<uint, access::write> outTexture          [[texture(2)]],
    uint2                          gid                 [[thread_position_in_grid]])
{
    const uint2 baseCoord = gid * uint2(2, 1);
    texture2d<half, access::read> inTexture = getInputTexture(gid.y, inTexture1, inTexture2);

    // Load and convert both pixels of the pair
    const float3 ycc0 = toOutput(float4(inTexture.read(baseCoord))).rgb;
    const float3 ycc1 = toOutput(float4(inTexture.read(baseCoord + uint2(1, 0)))).rgb;

    // Per-pixel luma
    const uint y0 = quantize8BitYuv(ycc0.r);
    const uint y1 = quantize8BitYuv(ycc1.r);

    // Chroma is shared by the pair: average, then quantize
    const uint cb_uint = quantize8BitYuv(0.5f * (ycc0.g + ycc1.g));
    const uint cr_uint = quantize8BitYuv(0.5f * (ycc0.b + ycc1.b));

    // Pack into one 32-bit word, least significant byte first: Cb Y0 Cr Y1
    const uint word0 = (cb_uint) | (y0 << 8) | (cr_uint << 16) | (y1 << 24);

    outTexture.write(word0, gid);
}


// Quantize a normalized component to a 10-bit code value.
// saturate() pins the input to [0, 1] before scaling so that overshoot clips to
// 0 / 1023 instead of wrapping around through the mask. The mask is kept as a
// last-resort guard so a stray value can never spill into the adjacent 10-bit
// field of the packed word.
inline uint quantize10BitYuv(float value)
{
    return uint(round(saturate(value) * 1023.0f)) & 0x3FFu;
}

// Convert six horizontally adjacent input pixels into four packed 10-bit 4:2:2 words.
//
// Thread gid reads input pixels (6 * gid.x + 0..5, gid.y), passes each through
// toOutput(), and treats the result's .r as luma (Y), .g as Cb and .b as Cr.
// Luma is kept per pixel; Cb and Cr are averaged over each of the three pixel
// pairs. The twelve 10-bit values are packed three per 32-bit word (bits 0-9,
// 10-19, 20-29; the top two bits stay zero) and written to outTexture at
// (4 * gid.x + 0..3, gid.y). The component order matches the v210 layout.
//
// inTexture1 / inTexture2  source textures; see getInputTexture() for how the
//                          row index selects between them
// outTexture               destination, four uints per six input pixels
//
// NOTE(review): there is no bounds check against the texture dimensions; this
// presumably relies on the input width being a multiple of 6 and on the
// dispatch grid matching the output size - confirm against the host code.
kernel void BGRATo2vuyKernel10(
    texture2d<half, access::read>  inTexture1          [[texture(0)]],
    texture2d<half, access::read>  inTexture2          [[texture(1)]],
    texture2d<uint, access::write> outTexture          [[texture(2)]],
    uint2                          gid                 [[thread_position_in_grid]])
{
    const uint2 baseCoord = gid * uint2(6, 1);
    texture2d<half, access::read> inTexture = getInputTexture(gid.y, inTexture1, inTexture2);

    const float3 ycc0 = toOutput(float4(inTexture.read(baseCoord + uint2(0, 0)))).rgb;
    const float3 ycc1 = toOutput(float4(inTexture.read(baseCoord + uint2(1, 0)))).rgb;
    const float3 ycc2 = toOutput(float4(inTexture.read(baseCoord + uint2(2, 0)))).rgb;
    const float3 ycc3 = toOutput(float4(inTexture.read(baseCoord + uint2(3, 0)))).rgb;
    const float3 ycc4 = toOutput(float4(inTexture.read(baseCoord + uint2(4, 0)))).rgb;
    const float3 ycc5 = toOutput(float4(inTexture.read(baseCoord + uint2(5, 0)))).rgb;

    // Luma (Y) values, one per pixel
    const uint y0 = quantize10BitYuv(ycc0.r);
    const uint y1 = quantize10BitYuv(ycc1.r);
    const uint y2 = quantize10BitYuv(ycc2.r);
    const uint y3 = quantize10BitYuv(ycc3.r);
    const uint y4 = quantize10BitYuv(ycc4.r);
    const uint y5 = quantize10BitYuv(ycc5.r);

    // Chroma: average each pixel pair, then quantize to 10 bits
    const uint cb0u = quantize10BitYuv(0.5f * (ycc0.g + ycc1.g));
    const uint cr0u = quantize10BitYuv(0.5f * (ycc0.b + ycc1.b));
    const uint cb1u = quantize10BitYuv(0.5f * (ycc2.g + ycc3.g));
    const uint cr1u = quantize10BitYuv(0.5f * (ycc2.b + ycc3.b));
    const uint cb2u = quantize10BitYuv(0.5f * (ycc4.g + ycc5.g));
    const uint cr2u = quantize10BitYuv(0.5f * (ycc4.b + ycc5.b));

    // Pack 10-bit fields into 32-bit words (low field first)
    const uint word0 = (cb0u     ) | (y0 << 10) | (cr0u << 20);
    const uint word1 = (y1       ) | (cb1u << 10) | (y2 << 20);
    const uint word2 = (cr1u     ) | (y3 << 10) | (cb2u << 20);
    const uint word3 = (y4       ) | (cr2u << 10) | (y5 << 20);

    // Write output: four consecutive words per thread
    const uint baseX = gid.x * 4;
    outTexture.write(word0, uint2(baseX + 0, gid.y));
    outTexture.write(word1, uint2(baseX + 1, gid.y));
    outTexture.write(word2, uint2(baseX + 2, gid.y));
    outTexture.write(word3, uint2(baseX + 3, gid.y));
}

// Reverse the byte order of a 32-bit word: byte 0 swaps with byte 3 and
// byte 1 swaps with byte 2.
uint byte_reverse(uint x) {
    // Isolate each byte first, then move it to its mirrored position.
    const uint fromByte0 = (x & 0x000000ffu) << 24;
    const uint fromByte1 = (x & 0x0000ff00u) << 8;
    const uint fromByte2 = (x & 0x00ff0000u) >> 8;
    const uint fromByte3 = (x & 0xff000000u) >> 24;
    return fromByte3 | fromByte2 | fromByte1 | fromByte0;
}
// Quantize a normalized component to a 10-bit code value.
// saturate() pins the input to [0, 1] before scaling so that overshoot clips to
// 0 / 1023 instead of wrapping around through the mask. The mask is kept as a
// last-resort guard so a stray value can never spill into the adjacent 10-bit
// field of the packed word.
inline uint quantize10BitYuva(float value)
{
    return uint(round(saturate(value) * 1023.0f)) & 0x3FFu;
}

// Convert and pack two BGRA pixels into two 10-bit 4:2:2 words that also carry alpha.
//
// Thread gid reads input pixels (2 * gid.x, gid.y) and (2 * gid.x + 1, gid.y),
// passes each through toOutput(), and treats the result's .r as luma (Y),
// .g as Cb, .b as Cr and .a as alpha. Luma and alpha are kept per pixel;
// Cb and Cr are averaged over the pair. Two words are produced:
//   word0 = Y0 | Cb << 10 | A0 << 20
//   word1 = Y1 | Cr << 10 | A1 << 20
// Each word is byte-reversed before being written to outTexture at
// (2 * gid.x + 0..1, gid.y).
//
// inTexture1 / inTexture2  source textures; see getInputTexture() for how the
//                          row index selects between them
// outTexture               destination, one uint per input pixel
//
// NOTE(review): there is no bounds check against the texture dimensions; this
// presumably relies on the dispatch grid matching the output size and on an
// even input width - confirm against the host code.
kernel void BGRATo2vuyaKernel10(
    texture2d<half, access::read>  inTexture1          [[texture(0)]],
    texture2d<half, access::read>  inTexture2          [[texture(1)]],
    texture2d<uint, access::write> outTexture          [[texture(2)]],
    uint2                          gid                 [[thread_position_in_grid]])
{
    texture2d<half, access::read> inTexture = getInputTexture(gid.y, inTexture1, inTexture2);

    const uint2 bgraBlockGid = gid * uint2(2, 1);

    // Load and convert both pixels of the pair
    const float4 px0 = toOutput(float4(inTexture.read(bgraBlockGid + uint2(0, 0))));
    const float4 px1 = toOutput(float4(inTexture.read(bgraBlockGid + uint2(1, 0))));

    // Per-pixel luma and alpha
    const uint y0 = quantize10BitYuva(px0.r);
    const uint y1 = quantize10BitYuva(px1.r);
    const uint a0 = quantize10BitYuva(px0.a);
    const uint a1 = quantize10BitYuva(px1.a);

    // Chroma is shared by the pair: average, then quantize
    const uint cb_uint = quantize10BitYuva((px0.g + px1.g) * 0.5f);
    const uint cr_uint = quantize10BitYuva((px0.b + px1.b) * 0.5f);

    // less performant but should be a sure thing
    const uint word0 = y0 | (cb_uint << 10) | (a0 << 20);
    const uint word1 = y1 | (cr_uint << 10) | (a1 << 20);

    const uint baseX = gid.x * 2;
    outTexture.write(byte_reverse(word0), uint2(baseX + 0, gid.y));
    outTexture.write(byte_reverse(word1), uint2(baseX + 1, gid.y));

    // more performant but not a sure thing, switch to this after testing on DeckLink 4K mini
    // (expanding the shifts gives the same 32-bit values as byte_reverse(word0) /
    // byte_reverse(word1) above; left disabled pending that hardware test)
//    uint word0 = (a0>>4) | ((a0&0xf)<<12) | ((cb_uint>>6)<<8) | ((y0>>8)<<16) | ((cb_uint&0x3f)<<18) | ((y0&0xff)<<24);
//    uint word1 = (a1>>4) | ((a1&0xf)<<12) | ((cr_uint>>6)<<8) | ((y1>>8)<<16) | ((cr_uint&0x3f)<<18) | ((y1&0xff)<<24);
//
//    uint baseX = gid.x * 2;
//    outTexture.write(word0, uint2(baseX + 0, gid.y));
//    outTexture.write(word1, uint2(baseX + 1, gid.y));
}

// Per-pixel copy: read the source pixel at gid, run it through toOutput(),
// and write it to outTexture at the same coordinate with its channels
// re-ordered so that the output's (r, g, b, a) receive the converted
// (g, r, a, b).
//
// inTexture1 / inTexture2  source textures; see getInputTexture() for how the
//                          row index selects between them
// outTexture               destination, same coordinates as the source
//
// NOTE(review): the channel shuffle presumably exists to produce the byte
// order the consumer expects from the destination's pixel format - confirm
// against the host code.
kernel void BRGACopy(
    texture2d<half, access::read>  inTexture1          [[texture(0)]],
    texture2d<half, access::read>  inTexture2          [[texture(1)]],
    texture2d<half, access::write> outTexture          [[texture(2)]],
    uint2                          gid                 [[thread_position_in_grid]])
{
    texture2d<half, access::read> source = getInputTexture(gid.y, inTexture1, inTexture2);

    const float4 converted = toOutput(float4(source.read(gid)));

    // Explicit form of the .grab swizzle
    const float4 reordered = float4(converted.g, converted.r, converted.a, converted.b);

    outTexture.write(half4(reordered), gid);
}
