function remapCLUTId = PsychHelperCreateRGB111110RemapCLUTAMDDCE8
% remapCLUTId = PsychHelperCreateRGB111110RemapCLUTAMDDCE8;
%
% Helper function for Psychtoolbox imaging pipeline, called by
% PsychImaging(), not meant to be called by normal user code!
%
% Build a 3 rows by 2048 texels RGBA8 lookup texture for mapping HDR RGB
% pixel values in range 0-2047 (ie. 11 bpc resolution) to RGBA8 framebuffer
% pixels which will be suitable to drive a BGR10-10-11 framebuffer scanout
% engine (CRTC). If a graphics cards framebuffer scanout engine CRTC is
% configured for 10-11-11 ~ 11 bpc framebuffer mode, this will make sure that
% HDR pixel data is shown properly onscreen.
%
% This texture is used as CLUT texture for the
% RGBMultiLUTLookupCombine_FormattingShader.frag.txt in the final output
% formatting chain of the imaging pipeline.
%
% The expected pixel layout in 11bpc mode is: BGR101111, ie.:
% R11 G11 B10 -- That's how a 32 bit pixel is interpreted for video
% scanout. However the OpenGL system on all current OS with standard
% drivers can only output A8R8G8B8 formatted pixels. We solve this via the
% imaging pipeline: Screen() and OpenGL drawing ops go to the imaging
% pipelines virtual framebuffer with 16bpc float or 32 bpc float precision.
% At Flip time, our shader remaps that 0.0 - 1.0 values to the range 0-2047
% ie. approx. 11 bpc, then uses the CLUT texture created with this function to
% lookup corresponding ARGB8 color tuples for the framebuffer, combines
% them into a single ARGB8 tuple by addition (logical OR), then writes that
% ARGB tuple to the framebuffer. At display scanout time, the CRTC's will
% find 32 bit pixels properly formatted for BGR101111 scanout.
%
% This requires a suitable graphics cards with an at least 11 bpc capable
% display pipeline. This is the case for recent AMD gpu's of the "Sea Islands"
% and "Volcanic Islands" family with a DCE-8 or later display engine. Those are
% said to have full 12 bpc pipelines.
%
% History:
%  8-Jun-2014  Written - Derived from PsychHelperCreateARGB2101010RemapCLUT.m (MK).
% 25-May-2016  Fixed up for DCE-10 "Volcanic Islands" Radeon R9 380 Tonga Pro (MK).
%              Turns out they need very different formatting and our original
%              implementation on non-12 bpc capable gpu's caused wrong test results.

% This routine assumes that a mogl GL context is properly set up:
global GL;

% Try to get LUT from cached .mat file -- faster:
cachedFile = [PsychtoolboxConfigDir 'rgb111110remaplut_DCE8.mat'];

if exist(cachedFile, 'file')
    % Get cached LUT from file:
    load(cachedFile);
else
    % Build lookup table: 3 LUT's (Red,Green,Blue) with 2048 slots with 4
    % output color components RGBA8 per entry, ie., a 32 bpp value:
    clut = uint8(zeros(1, 3 * 2048 * 4));

    % Startindex for 1st LUT entry:
    j=1;

    % This is the correct input to output color channels/bits mapping for the
    % BGR10-11-11 scanout with 32bpp format code 7 as used by the GPU.
    %
    %  Input:  -> Output
    %    B2:9  -> A0:7
    %    B0:1  -> R6:7
    %    G5:10 -> R0:5
    %    G0:4  -> G3:7
    %    R8:10 -> G0:2
    %    R0:7  -> B0:7

    % First the 11 bpc, 2048 levels RED LUT: Texrow 1
    for i=0:2047
        % Memory order for texture creation is RGBA:

        % Nothing in R:
        clut(j)= 0;
        j=j+1;
        % R8:10 -> G0:2:
        clut(j)= bitshift(bitand(i, bin2dec('11100000000')), -8);
        j=j+1;
        % R0:7  -> B0:7
        clut(j)= bitshift(bitand(i, bin2dec('00011111111')), +0);
        j=j+1;
        % Nothing in A:
        clut(j)= 0;
        j=j+1;
    end

    % Then the 11 bpc, 2048 levels GREEN LUT: Texrow 2
    for i=0:2047
        % Memory order for texture creation is RGBA:

        % G5:10 -> R0:5:
        clut(j)= bitshift(bitand(i, bin2dec('11111100000')), -5);
        j=j+1;
        % G0:4  -> G3:7:
        clut(j)= bitshift(bitand(i, bin2dec('00000011111')), +3);
        j=j+1;
        % Nothing in B:
        clut(j)= 0;
        j=j+1;
        % Nothing in A:
        clut(j)= 0;
        j=j+1;
    end

    % Last the only 10 bpc, only 1024 levels BLUE LUT: Texrow 3
    for i=0:1023
        % Memory order for texture creation is RGBA:

        % We only have 1024 intensity steps 'i', but because this
        % sub-LUT also has 2048 slots, we fill every two consecutive
        % slots with the same values, ie., do the fill process twice for
        % each value or step of i:
        for repeatsteap = 1:2
            % B0:1 - > R6:7
            clut(j)= bitshift(bitand(i, bin2dec('0000000011')), +6);
            j=j+1;
            % Nothing in G:
            clut(j)= 0;
            j=j+1;
            % Nothing in B:
            clut(j)= 0;
            j=j+1;
            % B2:9  -> A0:7:
            clut(j)= bitshift(bitand(i, bin2dec('1111111100')), -2);
            j=j+1;
        end
    end

    % Cache computed LUT in users config dir:
    save(cachedFile, 'clut', '-mat', '-V6');
end

% Hostformat defines byte-order in host memory:
% At increasing memory locations for each texel first R, then G, then B, then A.
hostformat = GL.UNSIGNED_BYTE;

% Create and setup texture from 'clut':
remapCLUTId = glGenTextures(1);
glBindTexture(GL.TEXTURE_RECTANGLE_EXT, remapCLUTId);
glTexImage2D(GL.TEXTURE_RECTANGLE_EXT, 0, GL.RGBA8, 2048, 3, 0, GL.RGBA, hostformat, clut);

% Make sure we use nearest neighbour sampling:
glTexParameteri(GL.TEXTURE_RECTANGLE_EXT, GL.TEXTURE_MIN_FILTER, GL.NEAREST);
glTexParameteri(GL.TEXTURE_RECTANGLE_EXT, GL.TEXTURE_MAG_FILTER, GL.NEAREST);

% And that we clamp to edge:
glTexParameteri(GL.TEXTURE_RECTANGLE_EXT, GL.TEXTURE_WRAP_S, GL.CLAMP);
glTexParameteri(GL.TEXTURE_RECTANGLE_EXT, GL.TEXTURE_WRAP_T, GL.CLAMP);

glBindTexture(GL.TEXTURE_RECTANGLE_EXT, 0);

% Ready.
return;