Codebase list frei0r / f89ec56 src / filter / autothresh0ld / variance.h
f89ec56

Tree @f89ec56 (Download .tar.gz)

variance.h @f89ec56raw · history · blame

#ifdef __SSE2__
#include <emmintrin.h>
#endif

static float find_variance(float *hist, int thresh)
{
    float w_0 = 0.0, mu_0 = 0.0, w_1 = 0.0, mu_1 = 0.0;

    for (int i = 0; i < thresh; ++i)
    {
        w_0 += hist[i];
        mu_0 += i * hist[i];
    }
    for (int i = thresh; i < 256; ++i)
    {
        w_1 += hist[i];
        mu_1 += i * hist[i];
    }

    float mu_diff = (mu_0/w_0 - mu_1/w_1);
    return w_0*w_1*mu_diff*mu_diff;
}

#ifdef __SSE2__
static float _sse1_hadd_ps(__m128 v)
{
    // Based on a StackOverflow answer.
    __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 sums = _mm_add_ps(v, shuf);
    shuf        = _mm_movehl_ps(shuf, sums);
    sums        = _mm_add_ss(sums, shuf);
    return _mm_cvtss_f32(sums);
}

static float find_variance_sse2(float *hist, int thresh)
{
    __m128 vw_0 = _mm_setzero_ps(), vmu_0 = _mm_setzero_ps(),
           vw_1 = _mm_setzero_ps(), vmu_1 = _mm_setzero_ps();
    __m128 vcnt = _mm_set_ps(0, 1, 2, 3);

    int thresh_low = (thresh / 4 + 0) * 4,
        thresh_up  = (thresh / 4 + 1) * 4;

    for (int i = 0; i < thresh_low; i+=4)
    {
        __m128 vhist = _mm_castsi128_ps(_mm_loadu_si128((void*)&hist[i]));

        vw_0  = _mm_add_ps(vhist, vw_0);
        vmu_0 = _mm_add_ps(_mm_mul_ps(vcnt, vhist), vw_0);
        vcnt  = _mm_add_ps(vcnt, _mm_set1_ps(4));
    }

    // This is skipped, and handled as an edge case by non-SSE code.
    vcnt = _mm_add_ps(vcnt, _mm_set1_ps(4));

    for (int i = thresh_up; i < 256; i+=4)
    {
        __m128 vhist = _mm_castsi128_ps(_mm_loadu_si128((void*)&hist[i]));

        vw_1  = _mm_add_ps(vhist, vw_1);
        vmu_1 = _mm_add_ps(_mm_mul_ps(vcnt, vhist), vw_1);
        vcnt  = _mm_add_ps(vcnt, _mm_set1_ps(4));
    }

    float w_0 = _sse1_hadd_ps(vw_0), mu_0 = _sse1_hadd_ps(vmu_0),
          w_1 = _sse1_hadd_ps(vw_1), mu_1 = _sse1_hadd_ps(vmu_1);

    // This edge case is here, because thresh may not be a multiple of 4.
    for (int i = thresh_low; i < thresh; ++i)
    {
        w_0 += hist[i];
        mu_0 += i * hist[i];
    }

    for (int i = thresh; i < thresh_up; ++i)
    {
        w_1 += hist[i];
        mu_1 += i * hist[i];
    }

    float mu_diff = (mu_0/w_0 - mu_1/w_1);
    return w_0*w_1*mu_diff*mu_diff;
}
#endif