function [shader configstring] = EXPCreateStatic2DConvolutionShader(kernel, nrinputchannels, nroutchannels, debug, shadertype)
% [shader configstring] = EXPCreateStatic2DConvolutionShader(kernel [, nrinputchannels=3][, nroutchannels=3][, debug][, shadertype])
% 
% Creates a GLSL fragment shader for 2D convolution of textures with
% the 2D or 1D convolution kernel 'kernel' and returns a handle 'shader'
% to it, as well as a blitter config options string as needed by the
% PTB imaging pipeline.
%
% Usually you won't call this routine directly, but use
% Add2DConvolutionToGLOperator() instead. It allows convenient setup of
% convolution operators of arbitrary size and complexity to GLOperators
% for use with Screen('TransformTexture'); and for use with the stimulus
% post-processing pipeline. Only if you want to create small convolution
% kernels for single-pass convolution of textures, you'll use this routine
% directly and use the returned 'shader' as argument to Screen('DrawTexture')
% or Screen('MakeTexture') as 'textureShader' argument.
%
% If you call this function directly, the generated shader may only work with
% the onscreen window that is active at the time you call the function. To make
% sure that the shader works with the onscreen window 'win' of your choice, call
% Screen('GetWindowInfo', win); first, to activate the window 'win'.
%
%
% Parameters:
%
% 'kernel' is a simple 2D m-by-n matrix of floating point numbers with m and
% n being odd numbers, e.g., 1x1, 3x3, 5x5, 7x7, 9x9,..., 1x3, 1x9, 7x1 ...
% Each entry in the kernel is used as a weight factor.
%
% The simplest way to get a 2D kernel is to use the function
% kernel = fspecial(...); fspecial is part of the Matlab image
% processing toolbox, see "help fspecial" for more information.
%
% 'nrinputchannels' = The number of image channels to use as input for the
% convolution. Possible values are: 3 = Red, Green and Blue color channels are
% provided as part of a true-color image, don't use the alpha channel (if
% any) for convolution but just pass it through unmodified. 1 = The image
% only defines a luminance channel for convolution, an (optional) alpha
% channel is passed through unmodified. 4 = Use all four channels (Red,
% green, blue, alpha) for convolution.
%
% 'nroutchannels' = The number of channels to convolve as output: 3 =
% Convolve each of the three color channels red, green and blue separately
% by the same kernel. An (optional) alpha channel is passed through unmodified.
% 1 = Output a filtered luminance channel, and an (optional) unmodified
% alpha channel. If input is a 3 channel RGB image, then the RGB image will
% get converted to luminance before convolution. 4 = Filter all four
% channels independently.
%
% Typical settings:
%
% Filter a RGB(A) image: nrinputchannels = 3, nroutchannels = 3.
% Filter a RGB(A) image into grayscale: nrinputchannels = 3, nroutchannels = 1.
% Filter a luminance(A) image: inputchannels = 1, filteredoutchannels = 1.
% Generic filtering of 4-channel data: nrinputchannels = 4, nroutchannels = 4.
%
% 'debug' Optional debug flag: If set to non-zero, will output some debug
% info about the shader.
%
% 'shadertype' (Optional) The type of internal implementation to choose for
% the operator. This parameter is best left alone, unless you really know
% what you are doing.
%
% CAUTION:
%
% This feature is in early alpha stage. It may fail on your system and its
% future implementation may change significantly! Don't trust its results
% without validation against a known good reference implementation!
%
% NOTES: Filtermode 2 and 3 not fully implemented for the general case!
% ---> No separable 1D kernels, no support for all input->output mappings

% History:
% 27.11.2006 written by Mario Kleiner.

global GL;
persistent initialized;

if nargin < 1
    error('CreateStatic2DConvolutionShader: No kernel provided!');
end

if isempty(kernel)
    error('CreateStatic2DConvolutionShader: No kernel provided!');
end

% Query size of kernel:
kernelw = size(kernel,1);
kernelh = size(kernel,2);

% We only want odd sized kernels of at least 3x3, e.g., 3x3, 5x5, 7x7, ...
if kernelw < 1 || kernelh < 1 || mod(kernelw,2)~=1 || mod(kernelh,2)~=1
    error('CreateStatic2DConvolutionShader: Only odd-sized kernels of at least size 3x3 supported!');
end

if nargin < 2 || isempty(nrinputchannels)
    % Default to RGB channels as separate input channels:
    nrinputchannels = 3;
end

if nargin < 3 || isempty(nroutchannels)
    % Default to output of filtered RGB channels:
    nroutchannels = 3;
end

if nargin < 4 || isempty(debug);
    % No debug mode provided: Disable debug output by default:
    debug = 0;
end

if nargin < 5 || isempty(shadertype)
    % No explicit shadertype provided. We use some guess-o-matic here, that
    % tries to do its best for different types of hardware, depending on
    % kernel-size:
    
    % Query type of graphics hardware (vendor and model):
    shadertype = 3;
end

% Preinit configstring to empty - the most common setting:
configstring = [];

% First time invocation?
if isempty(initialized)
    % Make sure GLSL and fragmentshaders are supported on first call:
    AssertGLSL;

    % Query supported extensions:
    extensions = glGetString(GL.EXTENSIONS);
    if isempty(findstr(extensions, 'GL_ARB_fragment_shader'))
        % No fragment shaders: This is a no go!
        error('Sorry, this function does not work on your graphics hardware due to lack of sufficient support for fragment shaders.');
    end

    % Clear any OpenGL error state.
    while (glGetError~=GL.NO_ERROR); end;

    maxuniforms = glGetIntegerv(GL.MAX_FRAGMENT_UNIFORM_COMPONENTS);
    
    % We are initialized:
    initialized = 1;
end % of initialization.

% Compute half-width and linear shader coefficient array:
hwx = (kernelw - 1) / 2;
hwy = (kernelh - 1) / 2;
shaderkernel = reshape(kernel, kernelw * kernelh, 1);

if shadertype == 3
    % Generic shader, using a texture bound to 2nd unit as lookup table for
    % the convolution kernel. This method is *way* less efficient than
    % lookups of the kernel in internal uniform registers or compiled-in
    % constants, but it scales up to very large kernels, whereas the former
    % methods are limited in kernel size by how many constants can be
    % compiled into the shader or stored in uniform registers of a specific
    % GPU.

    % Load our shader for convolution blit operations:
    if nrinputchannels == 1
        % 1-channel case: Convolve Red/Luminance channel with kernel,
        % replicate result into RGB/Luminance channel. Pass alpha channel
        % unmodified:
        shader = LoadGLSLProgramFromFiles('Convolve2DRectTexture1ChannelShader.frag.txt', 1);
    else
        % 4-channel case: Convolve all 4 channels by same kernel:
        shader = LoadGLSLProgramFromFiles('Convolve2DRectTextureShader.frag.txt', 1);
    end
    
    % Assign proper texture units for input image and clut:
    glUseProgram(shader);
    shader_image = glGetUniformLocation(shader, 'Image');
    shader_clut  = glGetUniformLocation(shader, 'Kernel');
    shader_kernelsizeX  = glGetUniformLocation(shader, 'KernelHalfWidthX');
    shader_kernelsizeY  = glGetUniformLocation(shader, 'KernelHalfWidthY');

    glUniform1i(shader_image, 0);
    glUniform1i(shader_clut, 1);
    glUniform1f(shader_kernelsizeX, hwx);
    glUniform1f(shader_kernelsizeY, hwy);

    glUseProgram(0);

    % Shader ready. Setup 2nd texture unit as LUT unit for our kernel:
    
    % Select the 2nd texture unit (unit 1) for setup:
    glActiveTexture(GL.TEXTURE1);
    luttex = glGenTextures(1);
    glBindTexture(GL.TEXTURE_RECTANGLE_EXT, luttex);
    glTexImage2D(GL.TEXTURE_RECTANGLE_EXT, 0, GL.LUMINANCE_FLOAT32_APPLE, kernelw, kernelh, 0, GL.LUMINANCE, GL.FLOAT, moglsingle(shaderkernel));

    % Make sure we use nearest neighbour sampling:
    glTexParameteri(GL.TEXTURE_RECTANGLE_EXT, GL.TEXTURE_MIN_FILTER, GL.NEAREST);
    glTexParameteri(GL.TEXTURE_RECTANGLE_EXT, GL.TEXTURE_MAG_FILTER, GL.NEAREST);

    % And that we clamp to edge:
    glTexParameteri(GL.TEXTURE_RECTANGLE_EXT, GL.TEXTURE_WRAP_S, GL.CLAMP);
    glTexParameteri(GL.TEXTURE_RECTANGLE_EXT, GL.TEXTURE_WRAP_T, GL.CLAMP);
    
    % Default CLUT setup done: Switch back to texture unit 0:
    glBindTexture(GL.TEXTURE_RECTANGLE_EXT, 0);
    glActiveTexture(GL.TEXTURE0);
    
    % Create shader config string so this lut texture gets used when
    % convolving:
    configstring = sprintf('TEXTURERECT2D(%i)=%i', 1, luttex);
end

if shadertype == 2
    % Generate a linear shader program, i.e., all loops are completely
    % unrolled. This may be a good workaround on old GPU's, but it didn't
    % yield any speed improvements on GF-7000 or X1600 ...
    
    % Header section:
    src = ['/* Statically compiled 2D convolution fragment shader for 2D rectangle textures.' char(10) ...
        '// Kernel Size is ' num2str(kernelw) ' by ' num2str(kernelh) char(10) ...
        '// Compiled by Psychtoolbox convolution shader metacompiler.'  char(10) ...
        '*/'  char(10)];

    src = [src '' char(10)  char(10) ...
        '#extension GL_ARB_texture_rectangle : enable' char(10) char(10) ...
        'uniform sampler2DRect Image;' char(10) char(10)];

    switch(nrinputchannels)
        case 1
            dtype  = '  float sum = float(0.0);';
            ifetch = '.r';
            douti  = '  gl_FragColor.rgb = vec3(sum); gl_FragColor.a = texture2DRect(Image, gl_TexCoord[0].st).a;';
        case 2
            dtype  = '  vec2 sum = vec2(0.0);';
            ifetch = '.rg';
            douti  = '  gl_FragColor.rg = sum; gl_FragColor.a = texture2DRect(Image, gl_TexCoord[0].st).a;';
        case 3
            dtype  = '  vec3 sum = vec3(0.0);';
            ifetch = '.rgb';
            douti  = '  gl_FragColor.rgb = sum; gl_FragColor.a = texture2DRect(Image, gl_TexCoord[0].st).a;';
        case 4
            dtype  = '  vec4 sum = vec4(0.0);';
            ifetch = '.rgba';
            douti  = '  gl_FragColor.rgba = sum;';
    end

    % Now for the program body:
    src =         [src ...
        'void main()' char(10)...
        '{' char(10) ...
        dtype char(10) char(10)];
    
    % Generate unrolled loop:
    i=0;
    for dy=-hwy:hwy
        for dx=-hwx:hwx
            i= i + 1;
%            src = [src '  sum += ' num2str(shaderkernel(i)) ' * texture2DRect(Image, gl_TexCoord[0].st + vec2(' num2str(dx) '.0, ' num2str(dy) '.0))' ifetch ';' char(10)];
            src = [src '  sum += ' sprintf('%.16f',shaderkernel(i)) ' * texture2DRect(Image, gl_TexCoord[0].st + vec2(' num2str(dx) '.0, ' num2str(dy) '.0))' ifetch ';' char(10)];
        end
    end

    % Generate tail:
    src = [src douti char(10) '}' char(10) char(10)];
    % Ok, we have our shader source string:
    if debug > 3
        src
    end

    % Create shader object, assign sourcecode and compile it:
    shandle = glCreateShader(GL.FRAGMENT_SHADER);
    if debug > 5
        glShaderSource(shandle, src, debug);
    else
        glShaderSource(shandle, src);
    end

    % This would abort if the shader would be invalid:
    glCompileShader(shandle);

    % Create new program object and get handle to it:
    shader = glCreateProgram;
    glAttachShader(shader, shandle);

    % Link it: This will abort if linker pass fails.
    glLinkProgram(shader);

    % Should be ready. Setup texture unit to be always unit zero:
    glUseProgram(shader);
    shader_image = glGetUniformLocation(shader, 'Image');
    glUniform1i(shader_image, 0);

    % Unbind program:
    glUseProgram(0);
end

if shadertype < 2
    % Precompiled shaders, not using textures as lookup tables:
    % Ok, now we synthesize a shader program source code string on the fly:

    % Header section:
    src = ['/* Statically compiled 2D convolution fragment shader for 2D rectangle textures.' char(10) ...
        '// Kernel Size is ' num2str(kernelw) ' by ' num2str(kernelh) char(10) ...
        '// Auto-Generated by Psychtoolbox convolution shader generator.'  char(10) ...
        '*/'  char(10)];
    
    src = [src '' char(10)  char(10) ...
        '#extension GL_ARB_texture_rectangle : enable' char(10) char(10) ...        
        'const int KernelHalfWidthX = ' num2str(hwx) ';' char(10) ...
        'const int KernelHalfWidthY = ' num2str(hwy) ';' char(10) ...
        'uniform sampler2DRect Image;' char(10)];

    if (nrinputchannels==3) && (nroutchannels==1)
        src = [src 'const vec3 rgb2grayweights = vec3(0.3, 0.59, 0.11);' char(10) ];
    end

    if shadertype == 0
        src = [src 'float kernel[' num2str(kernelw * kernelh) '];' char(10) char(10)];
    else
        src = [src 'uniform float kernel[' num2str(kernelw * kernelh) '];' char(10) char(10)];
    end

    switch(nrinputchannels)
        case 1 % Filter luminance channel only, but replicate filtered result to RGB. Alpha is passed through.
            dtype  = '  float tmp, sum = float(0.0);';
            ifetch = '      tmp = texture2DRect(Image, gl_TexCoord[0].st + vec2(float(dx), float(dy))).r;';
            iupdate= '      sum += tmp * kernel[i];';
            douti  = '  gl_FragColor.rgb = vec3(sum); gl_FragColor.a = texture2DRect(Image, gl_TexCoord[0].st).a;';
        case 2
            dtype  = '  vec2 tmp, sum = vec2(0.0);';
            ifetch = '      tmp = texture2DRect(Image, gl_TexCoord[0].st + vec2(float(dx), float(dy))).rg;';
            iupdate= '      sum += tmp * vec2(kernel[i]);';
            douti  = '  gl_FragColor.rg = sum; gl_FragColor.a = texture2DRect(Image, gl_TexCoord[0].st).a;';
        case 3 % Input is RGB. Alpha is passed through, ...
            if (nroutchannels == 3)
                % Filter each R,G,B channel separately...
                dtype  = '  vec3 tmp, sum = vec3(0.0);';
                ifetch = '      tmp = texture2DRect(Image, gl_TexCoord[0].st + vec2(float(dx), float(dy))).rgb;';
                iupdate= '      sum += tmp * vec3(kernel[i]);';
                douti  = '  gl_FragColor.rgb = sum; gl_FragColor.a = texture2DRect(Image, gl_TexCoord[0].st).a;';
            else
                % Convert RGB to Luminance, filter that, replicate result
                % into RGB out...
                dtype  = '  vec3 tmp; float sum = float(0.0);';
                ifetch = '      tmp = texture2DRect(Image, gl_TexCoord[0].st + vec2(float(dx), float(dy))).rgb;';
                iupdate= '      sum += dot(tmp, rgb2grayweights) * kernel[i];';
                douti  = '  gl_FragColor.rgb = vec3(sum); gl_FragColor.a = texture2DRect(Image, gl_TexCoord[0].st).a;';
            end
        case 4 % Filter all RGBA channels. Mostly useful for generic 4-channel data processing, not for imaging.
            dtype  = '  vec4 tmp, sum = vec4(0.0);';
            ifetch = '      tmp = texture2DRect(Image, gl_TexCoord[0].st + vec2(float(dx), float(dy))).rgba;';
            iupdate= '      sum += tmp * vec4(kernel[i]);';
            douti  = '  gl_FragColor.rgba = sum;';
    end

    % Now for the program body:
    src =         [src ...
        'void main()' char(10)...
        '{' char(10) ...
        '  int dx, dy, i;' char(10) ...
        dtype char(10) ...
        '  i=0;' char(10) ...
        ' '];

    % Type 0: Kernel is compiled into the shader as array of floating
    % point constants:
    if shadertype == 0
        % Output the kernel itself:
        for i=1:(kernelw*kernelh)
            src = sprintf('%s kernel[%i] = %.16f;', src, i-1, shaderkernel(i));
        end
    end

    src =         [src char(10) char(10) ...
        '  for (dy = -KernelHalfWidthY; dy <= KernelHalfWidthY; dy++) {' char(10) ...
        '    for (dx = -KernelHalfWidthX; dx <= KernelHalfWidthX; dx++) {' char(10) ...
        ifetch char(10) ...
        iupdate char(10) ...
        '      i++;' char(10) ...
        '    }' char(10) ...
        '  }' char(10) ...
        douti char(10) ...
        '}' char(10) char(10)];

    % Ok, we have our shader source string:
    if debug > 3
        fprintf(src);
    end

    % Create shader object, assign sourcecode and compile it:
    shandle = glCreateShader(GL.FRAGMENT_SHADER);
    if debug > 4
        glShaderSource(shandle, src, debug);
    else
        glShaderSource(shandle, src);
    end

    % This would abort if the shader would be invalid:
    glCompileShader(shandle);

    % Create new program object and get handle to it:
    shader = glCreateProgram;
    glAttachShader(shader, shandle);

    % Link it: This will abort if linker pass fails.
    glLinkProgram(shader);

    % Should be ready. Setup texture unit to be always unit zero:
    glUseProgram(shader);
    shader_image = glGetUniformLocation(shader, 'Image');
    glUniform1i(shader_image, 0);

    % For shadertype 1, set up uniforms with kernel:
    if shadertype == 1
        shader_kernel = glGetUniformLocation(shader, 'kernel[0]');

        % Output the kernel itself:
        for i=1:(kernelw*kernelh)
            glUniform1f(shader_kernel + i - 1, shaderkernel(i));
        end

    end

    % Unbind program:
    glUseProgram(0);
end

% Ok, shader should be ready for use with e.g., Screen('DrawTexture')...

% Check for errors:
err = glGetError;
if err
    fprintf('Error creating static 2D convolution shader! The GL returned: ');
    gluErrorString(err);
    shader = 0;
end

% Shader should be ready for use.
return;