#ifndef INC_IMAGE
#define INC_IMAGE

#include <iostream>
#include <cstdlib>
#include <Eigen/Eigen>

#include "stb_image.h"
#include "stb_image_write.h"

// NOTE(review): this using-directive is at header scope, so every file that
// includes this header gets the whole Eigen namespace injected as well.
using namespace Eigen;

/// Dynamically sized, row-major 2D array of floats used as the image type in
/// this header. loadImage() fills it as (height, width), i.e. rows are scanlines.
typedef Array<float, Dynamic, Dynamic, RowMajor> FImage;

/// Returns the smaller of two floats.
/// When neither argument compares less than the other (equal values, or a NaN
/// operand), the second argument is returned.
inline float min(float a, float b)
{
    if(a < b)
        return a;
    return b;
}

/// Returns the larger of two floats.
/// When a does not compare less than b (equal values, or a NaN operand),
/// the first argument is returned.
inline float max(float a, float b)
{
    if(a < b)
        return b;
    return a;
}

/// Returns a copy of a surrounded by a border of zeros.
///
/// @param a     Source array/matrix expression.
/// @param xpad  Number of zero columns added on the left and on the right.
/// @param ypad  Number of zero rows added on the top and on the bottom.
/// @return      An FImage of size (rows + 2 * ypad) x (cols + 2 * xpad) with a
///              copied into the centre.
template <typename Derived>
FImage pad(const DenseBase<Derived> &a, int xpad, int ypad)
{
    const auto srcRows = a.rows();
    const auto srcCols = a.cols();

    // Start from an all-zero canvas, then drop the source into the middle.
    FImage padded = FImage::Zero(srcRows + ypad * 2, srcCols + xpad * 2);
    padded.block(ypad, xpad, srcRows, srcCols) = a;
    return padded;
}

template <typename Derived>
void save(const DenseBase<Derived> &a, const char *path)
{
    int n = a.size();
    unsigned char *d = (unsigned char *)malloc(n * sizeof(unsigned char));
    for(int i = 0; i < n; i++)
        d[i] = a(i) > 255 ? 255 : (a(i) < 0 ? 0 : (unsigned char)a(i));
    stbi_write_png(path, a.cols(), a.rows(), 1, d, a.cols());
    free(d);
}

/// Convolutions

/// Filters in with a kh x kw kernel and stores the result in out.
///
/// For every pixel (row, col) the kernel is laid over the neighbourhood whose
/// top-left corner is (row - kh / 2, col - kw / 2) and the sum of the
/// element-wise products is written to out(row, col). The kernel is applied
/// as given (it is not flipped). Near the borders only the part of the kernel
/// that overlaps the image contributes, which is equivalent to treating
/// pixels outside the image as zero.
///
/// @tparam kw     Kernel width (number of columns).
/// @tparam kh     Kernel height (number of rows).
/// @param in      Source image.
/// @param out     Destination; must have the same dimensions as in (asserted).
/// @param kernel  Filter coefficients.
template <int kw, int kh, typename Derived1, typename Derived2>
void convolveInto(const DenseBase<Derived1> &in, DenseBase<Derived2> &out, const Array<float, kh, kw> &kernel)
{
    assert(in.rows() == out.rows() && in.cols() == out.cols());

    const int halfW = kw / 2;
    const int halfH = kh / 2;

    for(int row = 0; row < in.rows(); row++)
    {
        // Vertical extent of the kernel that overlaps the image for this row.
        const int kTop = max(0, halfH - row);
        const int kBottom = kh - max(0, row - (in.rows() - halfH - 1));
        const int srcTop = max(0, row - halfH);
        const int spanRows = kBottom - kTop;

        for(int col = 0; col < in.cols(); col++)
        {
            // Horizontal extent of the kernel that overlaps the image for this column.
            const int kLeft = max(0, halfW - col);
            const int kRight = kw - max(0, col - (in.cols() - halfW - 1));
            const int srcLeft = max(0, col - halfW);
            const int spanCols = kRight - kLeft;

            out(row, col) = (kernel.block(kTop, kLeft, spanRows, spanCols) *
                in.block(srcTop, srcLeft, spanRows, spanCols).array()).sum();
        }
    }
}

/// Convenience wrapper around convolveInto(): allocates an FImage with the
/// same dimensions as in, filters in into it and returns it.
///
/// @tparam kw     Kernel width (number of columns).
/// @tparam kh     Kernel height (number of rows).
/// @param in      Source image.
/// @param kernel  Filter coefficients.
/// @return        The filtered image.
template <int kw, int kh, typename Derived>
FImage convolve(const DenseBase<Derived> &in, const Array<float, kh, kw> &kernel)
{
    FImage result(in.rows(), in.cols());
    convolveInto<kw, kh>(in, result, kernel);
    return result;
}

/// Separable convolutions

/// Moves every element of a one slot towards index 0 and returns the value
/// that previously occupied a(0). The last slot is left untouched, so after
/// the call it still holds its old value.
template <int kw>
float shiftLeftInPlace(Array<float, 1, kw> &a)
{
    const float shiftedOut = a(0);
    for(int dst = 0, src = 1; src < kw; dst++, src++)
        a(dst) = a(src);
    return shiftedOut;
}

/// Applies a separable filter to in and stores the result in out.
///
/// The filter is applied in two passes: kernel2 (kh x 1) is first applied down
/// every column of in, writing into out; kernel1 (1 x kw) is then applied
/// along every row of out in place. Kernels are applied as given (not
/// flipped). At the borders the window is clipped to the image and only the
/// kernel taps that overlap it are used.
///
/// @tparam kw      Length of the horizontal kernel.
/// @tparam kh      Length of the vertical kernel.
/// @param in       Source image.
/// @param out      Destination; must have the same dimensions as in (asserted).
/// @param kernel1  Horizontal (row) kernel.
/// @param kernel2  Vertical (column) kernel.
///
/// NOTE(review): the border loops below always run hkh (resp. hkw) iterations,
/// so the image appears to need at least kh - 1 rows and kw - 1 columns (for
/// odd kernel sizes); smaller inputs would request blocks outside the image.
/// Confirm against callers.
/// NOTE(review): the vertical pass reads in while writing out, so in and out
/// presumably must not refer to the same storage - confirm.
template <int kw, int kh, typename Derived1, typename Derived2>
void convolveSepInto(const DenseBase<Derived1> &in, DenseBase<Derived2> &out, const Array<float, 1, kw> &kernel1, const Array<float, kh, 1> &kernel2)
{
    assert(in.cols() == out.cols() && in.rows() == out.rows());
    
    const int hkw = kw / 2, hkh = kh / 2;
    
    // Apply vertical filter to in and store into out
    for(int j = 0; j < in.cols(); j++)
    {
        int i = 0;
        // Top border: the window is clipped at row 0, so only the trailing
        // hkh + 1 + i taps of kernel2 are used.
        for(; i < hkh; i++)
            out(i, j) = in.block(0, j, hkh + 1 + i, 1).col(0).matrix().dot(kernel2.tail(hkh + 1 + i).matrix());
        // Interior: full kh-tap window starting at row i - hkh.
        for(; i < in.rows() - hkh; i++)
            out(i, j) = in.template block<kh, 1>(i - hkh, j).col(0).matrix().dot(kernel2.matrix());
        // Bottom border: the window shrinks by one row per step, so only the
        // leading kh - 1 - ki taps of kernel2 are used.
        for(int ki = 0; ki < hkh; ki++, i++)
            out(i, j) = in.block(i - hkh, j, kh - 1 - ki, 1).col(0).matrix().dot(kernel2.head(kh - 1 - ki).matrix());
    }
    
    // Apply horizontal filter to out in-place
    // buf holds results for this row that have been computed but not yet
    // written back. Writing column j - hkw only after column j has been
    // computed ensures no later window still needs the overwritten value.
    Array<float, 1, hkw + 1> buf;
    for(int i = 0; i < in.rows(); i++)
    {
        int j = 0;
        // Fill buf
        // Left border: results for columns 0 .. hkw - 1, using the clipped
        // window and the trailing taps of kernel1. Nothing is written to out yet.
        for(; j < hkw; j++)
            buf(j) = out.block(i, 0, 1, hkw + 1 + j).row(0).matrix().dot(kernel1.tail(hkw + 1 + j).matrix());
        
        // buf now acts as a shift register for the rest of the row
        // Interior: compute column j into the last slot, then write back the
        // oldest pending result (column j - hkw) that is shifted out.
        for(; j < in.cols() - hkw; j++)
        {
            buf(hkw) = out.template block<1, kw>(i, j - hkw).row(0).matrix().dot(kernel1.matrix());
            out(i, j - hkw) = shiftLeftInPlace(buf);
        }
        // Right border: same scheme with a window that shrinks by one column
        // per step, using the leading taps of kernel1.
        for(int kj = 0; kj < hkw; kj++, j++)
        {
            buf(hkw) = out.block(i, j - hkw, 1, kw - 1 - kj).row(0).matrix().dot(kernel1.head(kw - 1 - kj).matrix());
            out(i, j - hkw) = shiftLeftInPlace(buf);
        }
        // Flush the hkw results still pending in buf into the last hkw columns.
        out.template block<1, hkw>(i, in.cols() - hkw) = buf.head(hkw);
    }
}

/// Convenience wrapper around convolveSepInto(): allocates an FImage with the
/// same dimensions as in, runs the separable filter into it and returns it.
///
/// @tparam kw      Length of the horizontal kernel.
/// @tparam kh      Length of the vertical kernel.
/// @param in       Source image.
/// @param kernel1  Horizontal (1 x kw) kernel.
/// @param kernel2  Vertical (kh x 1) kernel.
/// @return         The filtered image.
template <int kw, int kh, typename Derived>
FImage convolveSep(const DenseBase<Derived> &in, const Array<float, 1, kw> &kernel1, const Array<float, kh, 1> &kernel2)
{
    FImage result(in.rows(), in.cols());
    convolveSepInto<kw, kh>(in, result, kernel1, kernel2);
    return result;
}

/// Loads an image file as a single-channel float image.
///
/// The file is decoded with stb_image, forcing one (grey) channel, and each
/// 8-bit sample is converted to float (range 0..255). The result has one row
/// per scanline, i.e. it is sized (height, width).
///
/// Declared inline because this header may be included from several
/// translation units.
///
/// @param path  Path of the image file to read.
/// @return      The decoded image, or an empty (0 x 0) FImage if the file
///              could not be loaded; the failure is reported on std::cerr.
inline FImage loadImage(const char *path)
{
    int w = 0, h = 0;
    unsigned char *data = stbi_load(path, &w, &h, nullptr, 1);
    if(data == nullptr)
    {
        std::cerr << "loadImage: failed to load '" << path << "': " << stbi_failure_reason() << '\n';
        return FImage();
    }

    FImage img(h, w);
    for(int r = 0; r < h; r++)
    {
        // size_t arithmetic so that large images cannot overflow int.
        const unsigned char *scanline = data + static_cast<size_t>(r) * static_cast<size_t>(w);
        for(int c = 0; c < w; c++)
            img(r, c) = static_cast<float>(scanline[c]);
    }

    // stbi_image_free is the public counterpart of stbi_load; STBI_FREE only
    // exists inside the stb_image implementation section.
    stbi_image_free(data);
    return img;
}

#endif
