#line 1 "numpy/core/src/umath/loops_arithm_fp.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*@targets
 ** $maxopt baseline
 ** sse2 avx2 avx512f
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

// TODO: replace raw SIMD with NPYV
//###############################################################################
//## Real Single/Double precision
//###############################################################################
/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
#ifdef NPY_HAVE_SSE2
#line 41
#line 47
/*
 * Element-wise npy_float addition: op[i] = ip1[i] + ip2[i] for i in [0, n).
 *
 * The vector width is fixed at compile time: 64 bytes when NPY_HAVE_AVX512F
 * is defined, 32 bytes when NPY_HAVE_AVX2 is defined, 16 bytes otherwise.
 * Every variant has the same three phases:
 *   1. a scalar loop opened by LOOP_BLOCK_ALIGN_VAR,
 *   2. one vector loop, chosen once from the alignment of &ip1[i] / &ip2[i]
 *      and from whether both inputs are the same pointer,
 *   3. a scalar LOOP_BLOCKED_END loop for whatever is left.
 *
 * NOTE(review): results are always written with the aligned store intrinsic,
 * so LOOP_BLOCK_ALIGN_VAR presumably advances `i` until `op` is aligned to
 * the vector width - confirm against the macro definition.
 */
static void
sse2_binary_add_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] + ip2[i];
    }
    /* the load flavour is decided once here, not inside the vector loops */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            /* only the first input is aligned */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m512 lhs = _mm512_load_ps(&ip1[i]);
                const __m512 rhs = _mm512_loadu_ps(&ip2[i]);
                _mm512_store_ps(&op[i], _mm512_add_ps(lhs, rhs));
            }
        }
        else if (ip1 != ip2) {
            /* both inputs aligned, distinct pointers */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m512 lhs = _mm512_load_ps(&ip1[i]);
                const __m512 rhs = _mm512_load_ps(&ip2[i]);
                _mm512_store_ps(&op[i], _mm512_add_ps(lhs, rhs));
            }
        }
        else {
            /* same pointer for both inputs: a single load supplies both operands */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m512 v = _mm512_load_ps(&ip1[i]);
                _mm512_store_ps(&op[i], _mm512_add_ps(v, v));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        /* only the second input is aligned */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_loadu_ps(&ip1[i]);
            const __m512 rhs = _mm512_load_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_add_ps(lhs, rhs));
        }
    }
    else if (ip1 != ip2) {
        /* neither input aligned, distinct pointers */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_loadu_ps(&ip1[i]);
            const __m512 rhs = _mm512_loadu_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_add_ps(lhs, rhs));
        }
    }
    else {
        /* neither input aligned, same pointer: one unaligned load is enough */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 v = _mm512_loadu_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_add_ps(v, v));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] + ip2[i];
    }
    /* the load flavour is decided once here, not inside the vector loops */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            /* only the first input is aligned */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m256 lhs = _mm256_load_ps(&ip1[i]);
                const __m256 rhs = _mm256_loadu_ps(&ip2[i]);
                _mm256_store_ps(&op[i], _mm256_add_ps(lhs, rhs));
            }
        }
        else if (ip1 != ip2) {
            /* both inputs aligned, distinct pointers */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m256 lhs = _mm256_load_ps(&ip1[i]);
                const __m256 rhs = _mm256_load_ps(&ip2[i]);
                _mm256_store_ps(&op[i], _mm256_add_ps(lhs, rhs));
            }
        }
        else {
            /* same pointer for both inputs: a single load supplies both operands */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m256 v = _mm256_load_ps(&ip1[i]);
                _mm256_store_ps(&op[i], _mm256_add_ps(v, v));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        /* only the second input is aligned */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_loadu_ps(&ip1[i]);
            const __m256 rhs = _mm256_load_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_add_ps(lhs, rhs));
        }
    }
    else if (ip1 != ip2) {
        /* neither input aligned, distinct pointers */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_loadu_ps(&ip1[i]);
            const __m256 rhs = _mm256_loadu_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_add_ps(lhs, rhs));
        }
    }
    else {
        /* neither input aligned, same pointer: one unaligned load is enough */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 v = _mm256_loadu_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_add_ps(v, v));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] + ip2[i];
    }
    /* the load flavour is decided once here, not inside the vector loops */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            /* only the first input is aligned */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m128 lhs = _mm_load_ps(&ip1[i]);
                const __m128 rhs = _mm_loadu_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_add_ps(lhs, rhs));
            }
        }
        else if (ip1 != ip2) {
            /* both inputs aligned, distinct pointers */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m128 lhs = _mm_load_ps(&ip1[i]);
                const __m128 rhs = _mm_load_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_add_ps(lhs, rhs));
            }
        }
        else {
            /* same pointer for both inputs: a single load supplies both operands */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m128 v = _mm_load_ps(&ip1[i]);
                _mm_store_ps(&op[i], _mm_add_ps(v, v));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        /* only the second input is aligned */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_loadu_ps(&ip1[i]);
            const __m128 rhs = _mm_load_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_add_ps(lhs, rhs));
        }
    }
    else if (ip1 != ip2) {
        /* neither input aligned, distinct pointers */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_loadu_ps(&ip1[i]);
            const __m128 rhs = _mm_loadu_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_add_ps(lhs, rhs));
        }
    }
    else {
        /* neither input aligned, same pointer: one unaligned load is enough */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 v = _mm_loadu_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_add_ps(v, v));
        }
    }
#endif
    /* scalar clean-up for the elements the vector loop did not cover */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] + ip2[i];
    }
}

/*
 * npy_float addition with a fixed first operand:
 * op[i] = ip1[0] + ip2[i] for i in [0, n).
 *
 * ip1[0] is broadcast into a vector register before any output is written;
 * the scalar loops read ip1[0] directly. Vector width is 64 bytes with
 * NPY_HAVE_AVX512F, 32 bytes with NPY_HAVE_AVX2 and 16 bytes otherwise.
 * The vector loop uses an aligned or unaligned load of ip2 depending on the
 * alignment of &ip2[i] once the LOOP_BLOCK_ALIGN_VAR loop has finished.
 */
static void
sse2_binary_scalar1_add_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    /* every lane holds ip1[0] */
    const __m512 lhs = _mm512_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[0] + ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 rhs = _mm512_loadu_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_add_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 rhs = _mm512_load_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_add_ps(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    /* every lane holds ip1[0] */
    const __m256 lhs = _mm256_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[0] + ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 rhs = _mm256_loadu_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_add_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 rhs = _mm256_load_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_add_ps(lhs, rhs));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    /* every lane holds ip1[0] */
    const __m128 lhs = _mm_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[0] + ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 rhs = _mm_loadu_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_add_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 rhs = _mm_load_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_add_ps(lhs, rhs));
        }
    }
#endif
    /* scalar clean-up for the elements the vector loop did not cover */
    LOOP_BLOCKED_END {
        op[i] = ip1[0] + ip2[i];
    }
}

/*
 * npy_float addition with a fixed second operand:
 * op[i] = ip1[i] + ip2[0] for i in [0, n).
 *
 * ip2[0] is broadcast into a vector register before any output is written;
 * the scalar loops read ip2[0] directly. Vector width is 64 bytes with
 * NPY_HAVE_AVX512F, 32 bytes with NPY_HAVE_AVX2 and 16 bytes otherwise.
 * The vector loop uses an aligned or unaligned load of ip1 depending on the
 * alignment of &ip1[i] once the LOOP_BLOCK_ALIGN_VAR loop has finished.
 */
static void
sse2_binary_scalar2_add_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    /* every lane holds ip2[0] */
    const __m512 rhs = _mm512_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] + ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_loadu_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_add_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_load_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_add_ps(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    /* every lane holds ip2[0] */
    const __m256 rhs = _mm256_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] + ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_loadu_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_add_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_load_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_add_ps(lhs, rhs));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    /* every lane holds ip2[0] */
    const __m128 rhs = _mm_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] + ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_loadu_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_add_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_load_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_add_ps(lhs, rhs));
        }
    }
#endif
    /* scalar clean-up for the elements the vector loop did not cover */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] + ip2[0];
    }
}


#line 47
/*
 * Element-wise npy_float subtraction: op[i] = ip1[i] - ip2[i] for i in [0, n).
 *
 * The vector width is fixed at compile time: 64 bytes when NPY_HAVE_AVX512F
 * is defined, 32 bytes when NPY_HAVE_AVX2 is defined, 16 bytes otherwise.
 * Every variant has the same three phases:
 *   1. a scalar loop opened by LOOP_BLOCK_ALIGN_VAR,
 *   2. one vector loop, chosen once from the alignment of &ip1[i] / &ip2[i]
 *      and from whether both inputs are the same pointer,
 *   3. a scalar LOOP_BLOCKED_END loop for whatever is left.
 *
 * NOTE(review): results are always written with the aligned store intrinsic,
 * so LOOP_BLOCK_ALIGN_VAR presumably advances `i` until `op` is aligned to
 * the vector width - confirm against the macro definition.
 */
static void
sse2_binary_subtract_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] - ip2[i];
    }
    /* the load flavour is decided once here, not inside the vector loops */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            /* only the first input is aligned */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m512 lhs = _mm512_load_ps(&ip1[i]);
                const __m512 rhs = _mm512_loadu_ps(&ip2[i]);
                _mm512_store_ps(&op[i], _mm512_sub_ps(lhs, rhs));
            }
        }
        else if (ip1 != ip2) {
            /* both inputs aligned, distinct pointers */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m512 lhs = _mm512_load_ps(&ip1[i]);
                const __m512 rhs = _mm512_load_ps(&ip2[i]);
                _mm512_store_ps(&op[i], _mm512_sub_ps(lhs, rhs));
            }
        }
        else {
            /* same pointer for both inputs: a single load supplies both operands */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m512 v = _mm512_load_ps(&ip1[i]);
                _mm512_store_ps(&op[i], _mm512_sub_ps(v, v));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        /* only the second input is aligned */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_loadu_ps(&ip1[i]);
            const __m512 rhs = _mm512_load_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_sub_ps(lhs, rhs));
        }
    }
    else if (ip1 != ip2) {
        /* neither input aligned, distinct pointers */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_loadu_ps(&ip1[i]);
            const __m512 rhs = _mm512_loadu_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_sub_ps(lhs, rhs));
        }
    }
    else {
        /* neither input aligned, same pointer: one unaligned load is enough */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 v = _mm512_loadu_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_sub_ps(v, v));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] - ip2[i];
    }
    /* the load flavour is decided once here, not inside the vector loops */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            /* only the first input is aligned */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m256 lhs = _mm256_load_ps(&ip1[i]);
                const __m256 rhs = _mm256_loadu_ps(&ip2[i]);
                _mm256_store_ps(&op[i], _mm256_sub_ps(lhs, rhs));
            }
        }
        else if (ip1 != ip2) {
            /* both inputs aligned, distinct pointers */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m256 lhs = _mm256_load_ps(&ip1[i]);
                const __m256 rhs = _mm256_load_ps(&ip2[i]);
                _mm256_store_ps(&op[i], _mm256_sub_ps(lhs, rhs));
            }
        }
        else {
            /* same pointer for both inputs: a single load supplies both operands */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m256 v = _mm256_load_ps(&ip1[i]);
                _mm256_store_ps(&op[i], _mm256_sub_ps(v, v));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        /* only the second input is aligned */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_loadu_ps(&ip1[i]);
            const __m256 rhs = _mm256_load_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_sub_ps(lhs, rhs));
        }
    }
    else if (ip1 != ip2) {
        /* neither input aligned, distinct pointers */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_loadu_ps(&ip1[i]);
            const __m256 rhs = _mm256_loadu_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_sub_ps(lhs, rhs));
        }
    }
    else {
        /* neither input aligned, same pointer: one unaligned load is enough */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 v = _mm256_loadu_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_sub_ps(v, v));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] - ip2[i];
    }
    /* the load flavour is decided once here, not inside the vector loops */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            /* only the first input is aligned */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m128 lhs = _mm_load_ps(&ip1[i]);
                const __m128 rhs = _mm_loadu_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_sub_ps(lhs, rhs));
            }
        }
        else if (ip1 != ip2) {
            /* both inputs aligned, distinct pointers */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m128 lhs = _mm_load_ps(&ip1[i]);
                const __m128 rhs = _mm_load_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_sub_ps(lhs, rhs));
            }
        }
        else {
            /* same pointer for both inputs: a single load supplies both operands */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m128 v = _mm_load_ps(&ip1[i]);
                _mm_store_ps(&op[i], _mm_sub_ps(v, v));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        /* only the second input is aligned */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_loadu_ps(&ip1[i]);
            const __m128 rhs = _mm_load_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_sub_ps(lhs, rhs));
        }
    }
    else if (ip1 != ip2) {
        /* neither input aligned, distinct pointers */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_loadu_ps(&ip1[i]);
            const __m128 rhs = _mm_loadu_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_sub_ps(lhs, rhs));
        }
    }
    else {
        /* neither input aligned, same pointer: one unaligned load is enough */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 v = _mm_loadu_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_sub_ps(v, v));
        }
    }
#endif
    /* scalar clean-up for the elements the vector loop did not cover */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] - ip2[i];
    }
}

/*
 * npy_float subtraction with a fixed first operand:
 * op[i] = ip1[0] - ip2[i] for i in [0, n).
 *
 * ip1[0] is broadcast into a vector register before any output is written;
 * the scalar loops read ip1[0] directly. Vector width is 64 bytes with
 * NPY_HAVE_AVX512F, 32 bytes with NPY_HAVE_AVX2 and 16 bytes otherwise.
 * The vector loop uses an aligned or unaligned load of ip2 depending on the
 * alignment of &ip2[i] once the LOOP_BLOCK_ALIGN_VAR loop has finished.
 */
static void
sse2_binary_scalar1_subtract_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    /* every lane holds ip1[0] */
    const __m512 lhs = _mm512_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[0] - ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 rhs = _mm512_loadu_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_sub_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 rhs = _mm512_load_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_sub_ps(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    /* every lane holds ip1[0] */
    const __m256 lhs = _mm256_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[0] - ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 rhs = _mm256_loadu_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_sub_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 rhs = _mm256_load_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_sub_ps(lhs, rhs));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    /* every lane holds ip1[0] */
    const __m128 lhs = _mm_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[0] - ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 rhs = _mm_loadu_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_sub_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 rhs = _mm_load_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_sub_ps(lhs, rhs));
        }
    }
#endif
    /* scalar clean-up for the elements the vector loop did not cover */
    LOOP_BLOCKED_END {
        op[i] = ip1[0] - ip2[i];
    }
}

/*
 * npy_float subtraction with a fixed second operand:
 * op[i] = ip1[i] - ip2[0] for i in [0, n).
 *
 * ip2[0] is broadcast into a vector register before any output is written;
 * the scalar loops read ip2[0] directly. Vector width is 64 bytes with
 * NPY_HAVE_AVX512F, 32 bytes with NPY_HAVE_AVX2 and 16 bytes otherwise.
 * The vector loop uses an aligned or unaligned load of ip1 depending on the
 * alignment of &ip1[i] once the LOOP_BLOCK_ALIGN_VAR loop has finished.
 */
static void
sse2_binary_scalar2_subtract_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    /* every lane holds ip2[0] */
    const __m512 rhs = _mm512_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] - ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_loadu_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_sub_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_load_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_sub_ps(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    /* every lane holds ip2[0] */
    const __m256 rhs = _mm256_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] - ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_loadu_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_sub_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_load_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_sub_ps(lhs, rhs));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    /* every lane holds ip2[0] */
    const __m128 rhs = _mm_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] - ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_loadu_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_sub_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_load_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_sub_ps(lhs, rhs));
        }
    }
#endif
    /* scalar clean-up for the elements the vector loop did not cover */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] - ip2[0];
    }
}


#line 47
/*
 * Element-wise npy_float multiplication: op[i] = ip1[i] * ip2[i] for
 * i in [0, n).
 *
 * The vector width is fixed at compile time: 64 bytes when NPY_HAVE_AVX512F
 * is defined, 32 bytes when NPY_HAVE_AVX2 is defined, 16 bytes otherwise.
 * Every variant has the same three phases:
 *   1. a scalar loop opened by LOOP_BLOCK_ALIGN_VAR,
 *   2. one vector loop, chosen once from the alignment of &ip1[i] / &ip2[i]
 *      and from whether both inputs are the same pointer,
 *   3. a scalar LOOP_BLOCKED_END loop for whatever is left.
 *
 * NOTE(review): results are always written with the aligned store intrinsic,
 * so LOOP_BLOCK_ALIGN_VAR presumably advances `i` until `op` is aligned to
 * the vector width - confirm against the macro definition.
 */
static void
sse2_binary_multiply_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] * ip2[i];
    }
    /* the load flavour is decided once here, not inside the vector loops */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            /* only the first input is aligned */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m512 lhs = _mm512_load_ps(&ip1[i]);
                const __m512 rhs = _mm512_loadu_ps(&ip2[i]);
                _mm512_store_ps(&op[i], _mm512_mul_ps(lhs, rhs));
            }
        }
        else if (ip1 != ip2) {
            /* both inputs aligned, distinct pointers */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m512 lhs = _mm512_load_ps(&ip1[i]);
                const __m512 rhs = _mm512_load_ps(&ip2[i]);
                _mm512_store_ps(&op[i], _mm512_mul_ps(lhs, rhs));
            }
        }
        else {
            /* same pointer for both inputs: a single load supplies both operands */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m512 v = _mm512_load_ps(&ip1[i]);
                _mm512_store_ps(&op[i], _mm512_mul_ps(v, v));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        /* only the second input is aligned */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_loadu_ps(&ip1[i]);
            const __m512 rhs = _mm512_load_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_mul_ps(lhs, rhs));
        }
    }
    else if (ip1 != ip2) {
        /* neither input aligned, distinct pointers */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_loadu_ps(&ip1[i]);
            const __m512 rhs = _mm512_loadu_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_mul_ps(lhs, rhs));
        }
    }
    else {
        /* neither input aligned, same pointer: one unaligned load is enough */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 v = _mm512_loadu_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_mul_ps(v, v));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] * ip2[i];
    }
    /* the load flavour is decided once here, not inside the vector loops */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            /* only the first input is aligned */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m256 lhs = _mm256_load_ps(&ip1[i]);
                const __m256 rhs = _mm256_loadu_ps(&ip2[i]);
                _mm256_store_ps(&op[i], _mm256_mul_ps(lhs, rhs));
            }
        }
        else if (ip1 != ip2) {
            /* both inputs aligned, distinct pointers */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m256 lhs = _mm256_load_ps(&ip1[i]);
                const __m256 rhs = _mm256_load_ps(&ip2[i]);
                _mm256_store_ps(&op[i], _mm256_mul_ps(lhs, rhs));
            }
        }
        else {
            /* same pointer for both inputs: a single load supplies both operands */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m256 v = _mm256_load_ps(&ip1[i]);
                _mm256_store_ps(&op[i], _mm256_mul_ps(v, v));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        /* only the second input is aligned */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_loadu_ps(&ip1[i]);
            const __m256 rhs = _mm256_load_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_mul_ps(lhs, rhs));
        }
    }
    else if (ip1 != ip2) {
        /* neither input aligned, distinct pointers */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_loadu_ps(&ip1[i]);
            const __m256 rhs = _mm256_loadu_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_mul_ps(lhs, rhs));
        }
    }
    else {
        /* neither input aligned, same pointer: one unaligned load is enough */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 v = _mm256_loadu_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_mul_ps(v, v));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] * ip2[i];
    }
    /* the load flavour is decided once here, not inside the vector loops */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            /* only the first input is aligned */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m128 lhs = _mm_load_ps(&ip1[i]);
                const __m128 rhs = _mm_loadu_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_mul_ps(lhs, rhs));
            }
        }
        else if (ip1 != ip2) {
            /* both inputs aligned, distinct pointers */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m128 lhs = _mm_load_ps(&ip1[i]);
                const __m128 rhs = _mm_load_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_mul_ps(lhs, rhs));
            }
        }
        else {
            /* same pointer for both inputs: a single load supplies both operands */
            LOOP_BLOCKED(npy_float, vector_size_bytes) {
                const __m128 v = _mm_load_ps(&ip1[i]);
                _mm_store_ps(&op[i], _mm_mul_ps(v, v));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        /* only the second input is aligned */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_loadu_ps(&ip1[i]);
            const __m128 rhs = _mm_load_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_mul_ps(lhs, rhs));
        }
    }
    else if (ip1 != ip2) {
        /* neither input aligned, distinct pointers */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_loadu_ps(&ip1[i]);
            const __m128 rhs = _mm_loadu_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_mul_ps(lhs, rhs));
        }
    }
    else {
        /* neither input aligned, same pointer: one unaligned load is enough */
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 v = _mm_loadu_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_mul_ps(v, v));
        }
    }
#endif
    /* scalar clean-up for the elements the vector loop did not cover */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] * ip2[i];
    }
}

/*
 * npy_float multiplication with a fixed first operand:
 * op[i] = ip1[0] * ip2[i] for i in [0, n).
 *
 * ip1[0] is broadcast into a vector register before any output is written;
 * the scalar loops read ip1[0] directly. Vector width is 64 bytes with
 * NPY_HAVE_AVX512F, 32 bytes with NPY_HAVE_AVX2 and 16 bytes otherwise.
 * The vector loop uses an aligned or unaligned load of ip2 depending on the
 * alignment of &ip2[i] once the LOOP_BLOCK_ALIGN_VAR loop has finished.
 */
static void
sse2_binary_scalar1_multiply_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    /* every lane holds ip1[0] */
    const __m512 lhs = _mm512_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[0] * ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 rhs = _mm512_loadu_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_mul_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 rhs = _mm512_load_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_mul_ps(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    /* every lane holds ip1[0] */
    const __m256 lhs = _mm256_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[0] * ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 rhs = _mm256_loadu_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_mul_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 rhs = _mm256_load_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_mul_ps(lhs, rhs));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    /* every lane holds ip1[0] */
    const __m128 lhs = _mm_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[0] * ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 rhs = _mm_loadu_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_mul_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 rhs = _mm_load_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_mul_ps(lhs, rhs));
        }
    }
#endif
    /* scalar clean-up for the elements the vector loop did not cover */
    LOOP_BLOCKED_END {
        op[i] = ip1[0] * ip2[i];
    }
}

/*
 * npy_float multiplication with a fixed second operand:
 * op[i] = ip1[i] * ip2[0] for i in [0, n).
 *
 * ip2[0] is broadcast into a vector register before any output is written;
 * the scalar loops read ip2[0] directly. Vector width is 64 bytes with
 * NPY_HAVE_AVX512F, 32 bytes with NPY_HAVE_AVX2 and 16 bytes otherwise.
 * The vector loop uses an aligned or unaligned load of ip1 depending on the
 * alignment of &ip1[i] once the LOOP_BLOCK_ALIGN_VAR loop has finished.
 */
static void
sse2_binary_scalar2_multiply_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    /* every lane holds ip2[0] */
    const __m512 rhs = _mm512_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] * ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_loadu_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_mul_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m512 lhs = _mm512_load_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_mul_ps(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    /* every lane holds ip2[0] */
    const __m256 rhs = _mm256_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] * ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_loadu_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_mul_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m256 lhs = _mm256_load_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_mul_ps(lhs, rhs));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    /* every lane holds ip2[0] */
    const __m128 rhs = _mm_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vector_size_bytes) {
        op[i] = ip1[i] * ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_loadu_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_mul_ps(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vector_size_bytes) {
            const __m128 lhs = _mm_load_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_mul_ps(lhs, rhs));
        }
    }
#endif
    /* scalar clean-up for the elements the vector loop did not cover */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] * ip2[0];
    }
}


#line 47
/*
 * Element-wise single-precision division: op[i] = ip1[i] / ip2[i].
 *
 * The register width is picked at compile time: 64 bytes with AVX512F,
 * 32 bytes with AVX2, 16 bytes otherwise. Inside each variant there is one
 * vector loop per combination of input alignments, and when ip1 and ip2 are
 * the same pointer the operand is loaded once and used on both sides.
 * op is always written with the aligned store.
 *
 * The index `i` and the loop bounds come from the LOOP_* macros, which are
 * defined outside this function; `n` is not referenced directly here and is
 * presumably consumed by those macros.
 */
static void
sse2_binary_divide_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vec_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vec_bytes) {
        op[i] = ip1[i] / ip2[i];
    }
    if (npy_is_aligned(&ip1[i], vec_bytes)) {
        if (!npy_is_aligned(&ip2[i], vec_bytes)) {
            /* ip1 aligned, ip2 unaligned */
            LOOP_BLOCKED(npy_float, vec_bytes) {
                const __m512 num = _mm512_load_ps(&ip1[i]);
                const __m512 den = _mm512_loadu_ps(&ip2[i]);
                _mm512_store_ps(&op[i], _mm512_div_ps(num, den));
            }
        }
        else if (ip1 == ip2) {
            /* both aligned, same buffer: one load feeds both operands */
            LOOP_BLOCKED(npy_float, vec_bytes) {
                const __m512 v = _mm512_load_ps(&ip1[i]);
                _mm512_store_ps(&op[i], _mm512_div_ps(v, v));
            }
        }
        else {
            /* both aligned, distinct buffers */
            LOOP_BLOCKED(npy_float, vec_bytes) {
                const __m512 num = _mm512_load_ps(&ip1[i]);
                const __m512 den = _mm512_load_ps(&ip2[i]);
                _mm512_store_ps(&op[i], _mm512_div_ps(num, den));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vec_bytes)) {
        /* ip1 unaligned, ip2 aligned */
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m512 num = _mm512_loadu_ps(&ip1[i]);
            const __m512 den = _mm512_load_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_div_ps(num, den));
        }
    }
    else if (ip1 == ip2) {
        /* both unaligned, same buffer */
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m512 v = _mm512_loadu_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_div_ps(v, v));
        }
    }
    else {
        /* both unaligned, distinct buffers */
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m512 num = _mm512_loadu_ps(&ip1[i]);
            const __m512 den = _mm512_loadu_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_div_ps(num, den));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vec_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vec_bytes) {
        op[i] = ip1[i] / ip2[i];
    }
    if (npy_is_aligned(&ip1[i], vec_bytes)) {
        if (!npy_is_aligned(&ip2[i], vec_bytes)) {
            /* ip1 aligned, ip2 unaligned */
            LOOP_BLOCKED(npy_float, vec_bytes) {
                const __m256 num = _mm256_load_ps(&ip1[i]);
                const __m256 den = _mm256_loadu_ps(&ip2[i]);
                _mm256_store_ps(&op[i], _mm256_div_ps(num, den));
            }
        }
        else if (ip1 == ip2) {
            /* both aligned, same buffer: one load feeds both operands */
            LOOP_BLOCKED(npy_float, vec_bytes) {
                const __m256 v = _mm256_load_ps(&ip1[i]);
                _mm256_store_ps(&op[i], _mm256_div_ps(v, v));
            }
        }
        else {
            /* both aligned, distinct buffers */
            LOOP_BLOCKED(npy_float, vec_bytes) {
                const __m256 num = _mm256_load_ps(&ip1[i]);
                const __m256 den = _mm256_load_ps(&ip2[i]);
                _mm256_store_ps(&op[i], _mm256_div_ps(num, den));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vec_bytes)) {
        /* ip1 unaligned, ip2 aligned */
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m256 num = _mm256_loadu_ps(&ip1[i]);
            const __m256 den = _mm256_load_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_div_ps(num, den));
        }
    }
    else if (ip1 == ip2) {
        /* both unaligned, same buffer */
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m256 v = _mm256_loadu_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_div_ps(v, v));
        }
    }
    else {
        /* both unaligned, distinct buffers */
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m256 num = _mm256_loadu_ps(&ip1[i]);
            const __m256 den = _mm256_loadu_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_div_ps(num, den));
        }
    }
#else
    const npy_intp vec_bytes = 16;
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vec_bytes) {
        op[i] = ip1[i] / ip2[i];
    }
    if (npy_is_aligned(&ip1[i], vec_bytes)) {
        if (!npy_is_aligned(&ip2[i], vec_bytes)) {
            /* ip1 aligned, ip2 unaligned */
            LOOP_BLOCKED(npy_float, vec_bytes) {
                const __m128 num = _mm_load_ps(&ip1[i]);
                const __m128 den = _mm_loadu_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_div_ps(num, den));
            }
        }
        else if (ip1 == ip2) {
            /* both aligned, same buffer: one load feeds both operands */
            LOOP_BLOCKED(npy_float, vec_bytes) {
                const __m128 v = _mm_load_ps(&ip1[i]);
                _mm_store_ps(&op[i], _mm_div_ps(v, v));
            }
        }
        else {
            /* both aligned, distinct buffers */
            LOOP_BLOCKED(npy_float, vec_bytes) {
                const __m128 num = _mm_load_ps(&ip1[i]);
                const __m128 den = _mm_load_ps(&ip2[i]);
                _mm_store_ps(&op[i], _mm_div_ps(num, den));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vec_bytes)) {
        /* ip1 unaligned, ip2 aligned */
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m128 num = _mm_loadu_ps(&ip1[i]);
            const __m128 den = _mm_load_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_div_ps(num, den));
        }
    }
    else if (ip1 == ip2) {
        /* both unaligned, same buffer */
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m128 v = _mm_loadu_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_div_ps(v, v));
        }
    }
    else {
        /* both unaligned, distinct buffers */
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m128 num = _mm_loadu_ps(&ip1[i]);
            const __m128 den = _mm_loadu_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_div_ps(num, den));
        }
    }
#endif
    /* scalar loop for whatever the vector loops left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] / ip2[i];
    }
}

/*
 * Divide the single value ip1[0] by each element of ip2 and write the
 * quotients to op.
 *
 * ip1[0] is broadcast into a vector register once, before any store to op.
 * The register width is picked at compile time: 64 bytes with AVX512F,
 * 32 bytes with AVX2, 16 bytes otherwise.
 *
 * The index `i` and the loop bounds come from the LOOP_* macros, which are
 * defined outside this function; `n` is not referenced directly here and is
 * presumably consumed by those macros.
 */
static void
sse2_binary_scalar1_divide_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vec_bytes = 64;
    const __m512 num = _mm512_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vec_bytes) {
        op[i] = ip1[0] / ip2[i];
    }
    /* op is always written with the aligned store; only the ip2 load varies */
    if (!npy_is_aligned(&ip2[i], vec_bytes)) {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m512 den = _mm512_loadu_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_div_ps(num, den));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m512 den = _mm512_load_ps(&ip2[i]);
            _mm512_store_ps(&op[i], _mm512_div_ps(num, den));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vec_bytes = 32;
    const __m256 num = _mm256_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vec_bytes) {
        op[i] = ip1[0] / ip2[i];
    }
    /* op is always written with the aligned store; only the ip2 load varies */
    if (!npy_is_aligned(&ip2[i], vec_bytes)) {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m256 den = _mm256_loadu_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_div_ps(num, den));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m256 den = _mm256_load_ps(&ip2[i]);
            _mm256_store_ps(&op[i], _mm256_div_ps(num, den));
        }
    }
#else
    const npy_intp vec_bytes = 16;
    const __m128 num = _mm_set1_ps(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vec_bytes) {
        op[i] = ip1[0] / ip2[i];
    }
    /* op is always written with the aligned store; only the ip2 load varies */
    if (!npy_is_aligned(&ip2[i], vec_bytes)) {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m128 den = _mm_loadu_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_div_ps(num, den));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m128 den = _mm_load_ps(&ip2[i]);
            _mm_store_ps(&op[i], _mm_div_ps(num, den));
        }
    }
#endif
    /* scalar loop for whatever the vector loops left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[0] / ip2[i];
    }
}

/*
 * Divide each element of ip1 by the single value ip2[0] and write the
 * quotients to op.
 *
 * ip2[0] is broadcast into a vector register once, before any store to op.
 * The register width is picked at compile time: 64 bytes with AVX512F,
 * 32 bytes with AVX2, 16 bytes otherwise.
 *
 * The index `i` and the loop bounds come from the LOOP_* macros, which are
 * defined outside this function; `n` is not referenced directly here and is
 * presumably consumed by those macros.
 */
static void
sse2_binary_scalar2_divide_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vec_bytes = 64;
    const __m512 den = _mm512_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vec_bytes) {
        op[i] = ip1[i] / ip2[0];
    }
    /* op is always written with the aligned store; only the ip1 load varies */
    if (!npy_is_aligned(&ip1[i], vec_bytes)) {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m512 num = _mm512_loadu_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_div_ps(num, den));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m512 num = _mm512_load_ps(&ip1[i]);
            _mm512_store_ps(&op[i], _mm512_div_ps(num, den));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vec_bytes = 32;
    const __m256 den = _mm256_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vec_bytes) {
        op[i] = ip1[i] / ip2[0];
    }
    /* op is always written with the aligned store; only the ip1 load varies */
    if (!npy_is_aligned(&ip1[i], vec_bytes)) {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m256 num = _mm256_loadu_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_div_ps(num, den));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m256 num = _mm256_load_ps(&ip1[i]);
            _mm256_store_ps(&op[i], _mm256_div_ps(num, den));
        }
    }
#else
    const npy_intp vec_bytes = 16;
    const __m128 den = _mm_set1_ps(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, vec_bytes) {
        op[i] = ip1[i] / ip2[0];
    }
    /* op is always written with the aligned store; only the ip1 load varies */
    if (!npy_is_aligned(&ip1[i], vec_bytes)) {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m128 num = _mm_loadu_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_div_ps(num, den));
        }
    }
    else {
        LOOP_BLOCKED(npy_float, vec_bytes) {
            const __m128 num = _mm_load_ps(&ip1[i]);
            _mm_store_ps(&op[i], _mm_div_ps(num, den));
        }
    }
#endif
    /* scalar loop for whatever the vector loops left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] / ip2[0];
    }
}



#line 41
#line 47
/*
 * Element-wise double-precision addition: op[i] = ip1[i] + ip2[i].
 *
 * The register width is picked at compile time: 64 bytes with AVX512F,
 * 32 bytes with AVX2, 16 bytes otherwise. Inside each variant there is one
 * vector loop per combination of input alignments, and when ip1 and ip2 are
 * the same pointer the operand is loaded once and used on both sides.
 * op is always written with the aligned store.
 *
 * The index `i` and the loop bounds come from the LOOP_* macros, which are
 * defined outside this function; `n` is not referenced directly here and is
 * presumably consumed by those macros.
 */
static void
sse2_binary_add_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vec_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] + ip2[i];
    }
    if (npy_is_aligned(&ip1[i], vec_bytes)) {
        if (!npy_is_aligned(&ip2[i], vec_bytes)) {
            /* ip1 aligned, ip2 unaligned */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m512d lhs = _mm512_load_pd(&ip1[i]);
                const __m512d rhs = _mm512_loadu_pd(&ip2[i]);
                _mm512_store_pd(&op[i], _mm512_add_pd(lhs, rhs));
            }
        }
        else if (ip1 == ip2) {
            /* both aligned, same buffer: one load feeds both operands */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m512d v = _mm512_load_pd(&ip1[i]);
                _mm512_store_pd(&op[i], _mm512_add_pd(v, v));
            }
        }
        else {
            /* both aligned, distinct buffers */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m512d lhs = _mm512_load_pd(&ip1[i]);
                const __m512d rhs = _mm512_load_pd(&ip2[i]);
                _mm512_store_pd(&op[i], _mm512_add_pd(lhs, rhs));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vec_bytes)) {
        /* ip1 unaligned, ip2 aligned */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d lhs = _mm512_loadu_pd(&ip1[i]);
            const __m512d rhs = _mm512_load_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_add_pd(lhs, rhs));
        }
    }
    else if (ip1 == ip2) {
        /* both unaligned, same buffer */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d v = _mm512_loadu_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_add_pd(v, v));
        }
    }
    else {
        /* both unaligned, distinct buffers */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d lhs = _mm512_loadu_pd(&ip1[i]);
            const __m512d rhs = _mm512_loadu_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_add_pd(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vec_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] + ip2[i];
    }
    if (npy_is_aligned(&ip1[i], vec_bytes)) {
        if (!npy_is_aligned(&ip2[i], vec_bytes)) {
            /* ip1 aligned, ip2 unaligned */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m256d lhs = _mm256_load_pd(&ip1[i]);
                const __m256d rhs = _mm256_loadu_pd(&ip2[i]);
                _mm256_store_pd(&op[i], _mm256_add_pd(lhs, rhs));
            }
        }
        else if (ip1 == ip2) {
            /* both aligned, same buffer: one load feeds both operands */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m256d v = _mm256_load_pd(&ip1[i]);
                _mm256_store_pd(&op[i], _mm256_add_pd(v, v));
            }
        }
        else {
            /* both aligned, distinct buffers */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m256d lhs = _mm256_load_pd(&ip1[i]);
                const __m256d rhs = _mm256_load_pd(&ip2[i]);
                _mm256_store_pd(&op[i], _mm256_add_pd(lhs, rhs));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vec_bytes)) {
        /* ip1 unaligned, ip2 aligned */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d lhs = _mm256_loadu_pd(&ip1[i]);
            const __m256d rhs = _mm256_load_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_add_pd(lhs, rhs));
        }
    }
    else if (ip1 == ip2) {
        /* both unaligned, same buffer */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d v = _mm256_loadu_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_add_pd(v, v));
        }
    }
    else {
        /* both unaligned, distinct buffers */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d lhs = _mm256_loadu_pd(&ip1[i]);
            const __m256d rhs = _mm256_loadu_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_add_pd(lhs, rhs));
        }
    }
#else
    const npy_intp vec_bytes = 16;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] + ip2[i];
    }
    if (npy_is_aligned(&ip1[i], vec_bytes)) {
        if (!npy_is_aligned(&ip2[i], vec_bytes)) {
            /* ip1 aligned, ip2 unaligned */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m128d lhs = _mm_load_pd(&ip1[i]);
                const __m128d rhs = _mm_loadu_pd(&ip2[i]);
                _mm_store_pd(&op[i], _mm_add_pd(lhs, rhs));
            }
        }
        else if (ip1 == ip2) {
            /* both aligned, same buffer: one load feeds both operands */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m128d v = _mm_load_pd(&ip1[i]);
                _mm_store_pd(&op[i], _mm_add_pd(v, v));
            }
        }
        else {
            /* both aligned, distinct buffers */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m128d lhs = _mm_load_pd(&ip1[i]);
                const __m128d rhs = _mm_load_pd(&ip2[i]);
                _mm_store_pd(&op[i], _mm_add_pd(lhs, rhs));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vec_bytes)) {
        /* ip1 unaligned, ip2 aligned */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d lhs = _mm_loadu_pd(&ip1[i]);
            const __m128d rhs = _mm_load_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_add_pd(lhs, rhs));
        }
    }
    else if (ip1 == ip2) {
        /* both unaligned, same buffer */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d v = _mm_loadu_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_add_pd(v, v));
        }
    }
    else {
        /* both unaligned, distinct buffers */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d lhs = _mm_loadu_pd(&ip1[i]);
            const __m128d rhs = _mm_loadu_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_add_pd(lhs, rhs));
        }
    }
#endif
    /* scalar loop for whatever the vector loops left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] + ip2[i];
    }
}

/*
 * Add the single value ip1[0] to each element of ip2 and write the sums
 * to op.
 *
 * ip1[0] is broadcast into a vector register once, before any store to op.
 * The register width is picked at compile time: 64 bytes with AVX512F,
 * 32 bytes with AVX2, 16 bytes otherwise.
 *
 * The index `i` and the loop bounds come from the LOOP_* macros, which are
 * defined outside this function; `n` is not referenced directly here and is
 * presumably consumed by those macros.
 */
static void
sse2_binary_scalar1_add_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vec_bytes = 64;
    const __m512d lhs = _mm512_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[0] + ip2[i];
    }
    /* op is always written with the aligned store; only the ip2 load varies */
    if (!npy_is_aligned(&ip2[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d rhs = _mm512_loadu_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_add_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d rhs = _mm512_load_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_add_pd(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vec_bytes = 32;
    const __m256d lhs = _mm256_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[0] + ip2[i];
    }
    /* op is always written with the aligned store; only the ip2 load varies */
    if (!npy_is_aligned(&ip2[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d rhs = _mm256_loadu_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_add_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d rhs = _mm256_load_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_add_pd(lhs, rhs));
        }
    }
#else
    const npy_intp vec_bytes = 16;
    const __m128d lhs = _mm_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[0] + ip2[i];
    }
    /* op is always written with the aligned store; only the ip2 load varies */
    if (!npy_is_aligned(&ip2[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d rhs = _mm_loadu_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_add_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d rhs = _mm_load_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_add_pd(lhs, rhs));
        }
    }
#endif
    /* scalar loop for whatever the vector loops left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[0] + ip2[i];
    }
}

/*
 * Add the single value ip2[0] to each element of ip1 and write the sums
 * to op.
 *
 * ip2[0] is broadcast into a vector register once, before any store to op.
 * The register width is picked at compile time: 64 bytes with AVX512F,
 * 32 bytes with AVX2, 16 bytes otherwise.
 *
 * The index `i` and the loop bounds come from the LOOP_* macros, which are
 * defined outside this function; `n` is not referenced directly here and is
 * presumably consumed by those macros.
 */
static void
sse2_binary_scalar2_add_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vec_bytes = 64;
    const __m512d rhs = _mm512_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] + ip2[0];
    }
    /* op is always written with the aligned store; only the ip1 load varies */
    if (!npy_is_aligned(&ip1[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d lhs = _mm512_loadu_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_add_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d lhs = _mm512_load_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_add_pd(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vec_bytes = 32;
    const __m256d rhs = _mm256_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] + ip2[0];
    }
    /* op is always written with the aligned store; only the ip1 load varies */
    if (!npy_is_aligned(&ip1[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d lhs = _mm256_loadu_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_add_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d lhs = _mm256_load_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_add_pd(lhs, rhs));
        }
    }
#else
    const npy_intp vec_bytes = 16;
    const __m128d rhs = _mm_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] + ip2[0];
    }
    /* op is always written with the aligned store; only the ip1 load varies */
    if (!npy_is_aligned(&ip1[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d lhs = _mm_loadu_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_add_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d lhs = _mm_load_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_add_pd(lhs, rhs));
        }
    }
#endif
    /* scalar loop for whatever the vector loops left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] + ip2[0];
    }
}


#line 47
/*
 * Element-wise double-precision subtraction: op[i] = ip1[i] - ip2[i].
 *
 * The register width is picked at compile time: 64 bytes with AVX512F,
 * 32 bytes with AVX2, 16 bytes otherwise. Inside each variant there is one
 * vector loop per combination of input alignments, and when ip1 and ip2 are
 * the same pointer the operand is loaded once and used on both sides.
 * op is always written with the aligned store.
 *
 * The index `i` and the loop bounds come from the LOOP_* macros, which are
 * defined outside this function; `n` is not referenced directly here and is
 * presumably consumed by those macros.
 */
static void
sse2_binary_subtract_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vec_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] - ip2[i];
    }
    if (npy_is_aligned(&ip1[i], vec_bytes)) {
        if (!npy_is_aligned(&ip2[i], vec_bytes)) {
            /* ip1 aligned, ip2 unaligned */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m512d lhs = _mm512_load_pd(&ip1[i]);
                const __m512d rhs = _mm512_loadu_pd(&ip2[i]);
                _mm512_store_pd(&op[i], _mm512_sub_pd(lhs, rhs));
            }
        }
        else if (ip1 == ip2) {
            /* both aligned, same buffer: one load feeds both operands */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m512d v = _mm512_load_pd(&ip1[i]);
                _mm512_store_pd(&op[i], _mm512_sub_pd(v, v));
            }
        }
        else {
            /* both aligned, distinct buffers */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m512d lhs = _mm512_load_pd(&ip1[i]);
                const __m512d rhs = _mm512_load_pd(&ip2[i]);
                _mm512_store_pd(&op[i], _mm512_sub_pd(lhs, rhs));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vec_bytes)) {
        /* ip1 unaligned, ip2 aligned */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d lhs = _mm512_loadu_pd(&ip1[i]);
            const __m512d rhs = _mm512_load_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_sub_pd(lhs, rhs));
        }
    }
    else if (ip1 == ip2) {
        /* both unaligned, same buffer */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d v = _mm512_loadu_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_sub_pd(v, v));
        }
    }
    else {
        /* both unaligned, distinct buffers */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d lhs = _mm512_loadu_pd(&ip1[i]);
            const __m512d rhs = _mm512_loadu_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_sub_pd(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vec_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] - ip2[i];
    }
    if (npy_is_aligned(&ip1[i], vec_bytes)) {
        if (!npy_is_aligned(&ip2[i], vec_bytes)) {
            /* ip1 aligned, ip2 unaligned */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m256d lhs = _mm256_load_pd(&ip1[i]);
                const __m256d rhs = _mm256_loadu_pd(&ip2[i]);
                _mm256_store_pd(&op[i], _mm256_sub_pd(lhs, rhs));
            }
        }
        else if (ip1 == ip2) {
            /* both aligned, same buffer: one load feeds both operands */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m256d v = _mm256_load_pd(&ip1[i]);
                _mm256_store_pd(&op[i], _mm256_sub_pd(v, v));
            }
        }
        else {
            /* both aligned, distinct buffers */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m256d lhs = _mm256_load_pd(&ip1[i]);
                const __m256d rhs = _mm256_load_pd(&ip2[i]);
                _mm256_store_pd(&op[i], _mm256_sub_pd(lhs, rhs));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vec_bytes)) {
        /* ip1 unaligned, ip2 aligned */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d lhs = _mm256_loadu_pd(&ip1[i]);
            const __m256d rhs = _mm256_load_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_sub_pd(lhs, rhs));
        }
    }
    else if (ip1 == ip2) {
        /* both unaligned, same buffer */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d v = _mm256_loadu_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_sub_pd(v, v));
        }
    }
    else {
        /* both unaligned, distinct buffers */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d lhs = _mm256_loadu_pd(&ip1[i]);
            const __m256d rhs = _mm256_loadu_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_sub_pd(lhs, rhs));
        }
    }
#else
    const npy_intp vec_bytes = 16;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] - ip2[i];
    }
    if (npy_is_aligned(&ip1[i], vec_bytes)) {
        if (!npy_is_aligned(&ip2[i], vec_bytes)) {
            /* ip1 aligned, ip2 unaligned */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m128d lhs = _mm_load_pd(&ip1[i]);
                const __m128d rhs = _mm_loadu_pd(&ip2[i]);
                _mm_store_pd(&op[i], _mm_sub_pd(lhs, rhs));
            }
        }
        else if (ip1 == ip2) {
            /* both aligned, same buffer: one load feeds both operands */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m128d v = _mm_load_pd(&ip1[i]);
                _mm_store_pd(&op[i], _mm_sub_pd(v, v));
            }
        }
        else {
            /* both aligned, distinct buffers */
            LOOP_BLOCKED(npy_double, vec_bytes) {
                const __m128d lhs = _mm_load_pd(&ip1[i]);
                const __m128d rhs = _mm_load_pd(&ip2[i]);
                _mm_store_pd(&op[i], _mm_sub_pd(lhs, rhs));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vec_bytes)) {
        /* ip1 unaligned, ip2 aligned */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d lhs = _mm_loadu_pd(&ip1[i]);
            const __m128d rhs = _mm_load_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_sub_pd(lhs, rhs));
        }
    }
    else if (ip1 == ip2) {
        /* both unaligned, same buffer */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d v = _mm_loadu_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_sub_pd(v, v));
        }
    }
    else {
        /* both unaligned, distinct buffers */
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d lhs = _mm_loadu_pd(&ip1[i]);
            const __m128d rhs = _mm_loadu_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_sub_pd(lhs, rhs));
        }
    }
#endif
    /* scalar loop for whatever the vector loops left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] - ip2[i];
    }
}

/*
 * Subtract each element of ip2 from the single value ip1[0] and write the
 * differences to op.
 *
 * ip1[0] is broadcast into a vector register once, before any store to op.
 * The register width is picked at compile time: 64 bytes with AVX512F,
 * 32 bytes with AVX2, 16 bytes otherwise.
 *
 * The index `i` and the loop bounds come from the LOOP_* macros, which are
 * defined outside this function; `n` is not referenced directly here and is
 * presumably consumed by those macros.
 */
static void
sse2_binary_scalar1_subtract_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vec_bytes = 64;
    const __m512d lhs = _mm512_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[0] - ip2[i];
    }
    /* op is always written with the aligned store; only the ip2 load varies */
    if (!npy_is_aligned(&ip2[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d rhs = _mm512_loadu_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_sub_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d rhs = _mm512_load_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_sub_pd(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vec_bytes = 32;
    const __m256d lhs = _mm256_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[0] - ip2[i];
    }
    /* op is always written with the aligned store; only the ip2 load varies */
    if (!npy_is_aligned(&ip2[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d rhs = _mm256_loadu_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_sub_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d rhs = _mm256_load_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_sub_pd(lhs, rhs));
        }
    }
#else
    const npy_intp vec_bytes = 16;
    const __m128d lhs = _mm_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[0] - ip2[i];
    }
    /* op is always written with the aligned store; only the ip2 load varies */
    if (!npy_is_aligned(&ip2[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d rhs = _mm_loadu_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_sub_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d rhs = _mm_load_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_sub_pd(lhs, rhs));
        }
    }
#endif
    /* scalar loop for whatever the vector loops left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[0] - ip2[i];
    }
}

/*
 * Subtract the single value ip2[0] from each element of ip1 and write the
 * differences to op.
 *
 * ip2[0] is broadcast into a vector register once, before any store to op.
 * The register width is picked at compile time: 64 bytes with AVX512F,
 * 32 bytes with AVX2, 16 bytes otherwise.
 *
 * The index `i` and the loop bounds come from the LOOP_* macros, which are
 * defined outside this function; `n` is not referenced directly here and is
 * presumably consumed by those macros.
 */
static void
sse2_binary_scalar2_subtract_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vec_bytes = 64;
    const __m512d rhs = _mm512_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] - ip2[0];
    }
    /* op is always written with the aligned store; only the ip1 load varies */
    if (!npy_is_aligned(&ip1[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d lhs = _mm512_loadu_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_sub_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m512d lhs = _mm512_load_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_sub_pd(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vec_bytes = 32;
    const __m256d rhs = _mm256_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] - ip2[0];
    }
    /* op is always written with the aligned store; only the ip1 load varies */
    if (!npy_is_aligned(&ip1[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d lhs = _mm256_loadu_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_sub_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m256d lhs = _mm256_load_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_sub_pd(lhs, rhs));
        }
    }
#else
    const npy_intp vec_bytes = 16;
    const __m128d rhs = _mm_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vec_bytes) {
        op[i] = ip1[i] - ip2[0];
    }
    /* op is always written with the aligned store; only the ip1 load varies */
    if (!npy_is_aligned(&ip1[i], vec_bytes)) {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d lhs = _mm_loadu_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_sub_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vec_bytes) {
            const __m128d lhs = _mm_load_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_sub_pd(lhs, rhs));
        }
    }
#endif
    /* scalar loop for whatever the vector loops left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] - ip2[0];
    }
}


#line 47
/*
 * Element-wise product of two double arrays:
 * op[i] = ip1[i] * ip2[i] for every i in [0, n).
 *
 * One vector width is compiled in, selected by the NPY_HAVE_* macros
 * (AVX512F: 64 bytes, AVX2: 32 bytes, otherwise 16 bytes).  The LOOP_*
 * macros are defined outside this section and supply the index `i`;
 * presumably they split the range into a scalar prefix that aligns `op`,
 * a vector-blocked middle and a scalar tail.
 *
 * Inside each width there is one loop per combination of input alignment,
 * plus a variant for ip1 == ip2 that loads each block only once.
 */
static void
sse2_binary_multiply_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] * ip2[i];
    }
    /* one specialised loop per input-alignment combination */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m512d x = _mm512_load_pd(&ip1[i]);
                const __m512d y = _mm512_loadu_pd(&ip2[i]);
                _mm512_store_pd(&op[i], _mm512_mul_pd(x, y));
            }
        }
        else if (ip1 == ip2) {
            /* same input twice: a single load feeds both operands */
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m512d x = _mm512_load_pd(&ip1[i]);
                _mm512_store_pd(&op[i], _mm512_mul_pd(x, x));
            }
        }
        else {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m512d x = _mm512_load_pd(&ip1[i]);
                const __m512d y = _mm512_load_pd(&ip2[i]);
                _mm512_store_pd(&op[i], _mm512_mul_pd(x, y));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d x = _mm512_loadu_pd(&ip1[i]);
            const __m512d y = _mm512_load_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_mul_pd(x, y));
        }
    }
    else if (ip1 == ip2) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d x = _mm512_loadu_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_mul_pd(x, x));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d x = _mm512_loadu_pd(&ip1[i]);
            const __m512d y = _mm512_loadu_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_mul_pd(x, y));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] * ip2[i];
    }
    /* one specialised loop per input-alignment combination */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m256d x = _mm256_load_pd(&ip1[i]);
                const __m256d y = _mm256_loadu_pd(&ip2[i]);
                _mm256_store_pd(&op[i], _mm256_mul_pd(x, y));
            }
        }
        else if (ip1 == ip2) {
            /* same input twice: a single load feeds both operands */
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m256d x = _mm256_load_pd(&ip1[i]);
                _mm256_store_pd(&op[i], _mm256_mul_pd(x, x));
            }
        }
        else {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m256d x = _mm256_load_pd(&ip1[i]);
                const __m256d y = _mm256_load_pd(&ip2[i]);
                _mm256_store_pd(&op[i], _mm256_mul_pd(x, y));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d x = _mm256_loadu_pd(&ip1[i]);
            const __m256d y = _mm256_load_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_mul_pd(x, y));
        }
    }
    else if (ip1 == ip2) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d x = _mm256_loadu_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_mul_pd(x, x));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d x = _mm256_loadu_pd(&ip1[i]);
            const __m256d y = _mm256_loadu_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_mul_pd(x, y));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] * ip2[i];
    }
    /* one specialised loop per input-alignment combination */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m128d x = _mm_load_pd(&ip1[i]);
                const __m128d y = _mm_loadu_pd(&ip2[i]);
                _mm_store_pd(&op[i], _mm_mul_pd(x, y));
            }
        }
        else if (ip1 == ip2) {
            /* same input twice: a single load feeds both operands */
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m128d x = _mm_load_pd(&ip1[i]);
                _mm_store_pd(&op[i], _mm_mul_pd(x, x));
            }
        }
        else {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m128d x = _mm_load_pd(&ip1[i]);
                const __m128d y = _mm_load_pd(&ip2[i]);
                _mm_store_pd(&op[i], _mm_mul_pd(x, y));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d x = _mm_loadu_pd(&ip1[i]);
            const __m128d y = _mm_load_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_mul_pd(x, y));
        }
    }
    else if (ip1 == ip2) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d x = _mm_loadu_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_mul_pd(x, x));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d x = _mm_loadu_pd(&ip1[i]);
            const __m128d y = _mm_loadu_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_mul_pd(x, y));
        }
    }
#endif
    /* whatever the vector loop left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] * ip2[i];
    }
}

/*
 * Multiply a broadcast scalar by a double array:
 * op[i] = ip1[0] * ip2[i] for every i in [0, n).
 *
 * One vector width is compiled in, selected by the NPY_HAVE_* macros
 * (AVX512F: 64 bytes, AVX2: 32 bytes, otherwise 16 bytes).  The LOOP_*
 * macros are defined outside this section and supply the index `i`;
 * presumably they split the range into a scalar prefix that aligns `op`,
 * a vector-blocked middle and a scalar tail.
 */
static void
sse2_binary_scalar1_multiply_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    const __m512d lhs = _mm512_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[0] * ip2[i];
    }
    /* only the load from ip2 depends on its alignment */
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d rhs = _mm512_loadu_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_mul_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d rhs = _mm512_load_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_mul_pd(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    const __m256d lhs = _mm256_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[0] * ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d rhs = _mm256_loadu_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_mul_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d rhs = _mm256_load_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_mul_pd(lhs, rhs));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    const __m128d lhs = _mm_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[0] * ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d rhs = _mm_loadu_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_mul_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d rhs = _mm_load_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_mul_pd(lhs, rhs));
        }
    }
#endif
    /* whatever the vector loop left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[0] * ip2[i];
    }
}

/*
 * Multiply a double array by a broadcast scalar:
 * op[i] = ip1[i] * ip2[0] for every i in [0, n).
 *
 * One vector width is compiled in, selected by the NPY_HAVE_* macros
 * (AVX512F: 64 bytes, AVX2: 32 bytes, otherwise 16 bytes).  The LOOP_*
 * macros are defined outside this section and supply the index `i`;
 * presumably they split the range into a scalar prefix that aligns `op`,
 * a vector-blocked middle and a scalar tail.
 */
static void
sse2_binary_scalar2_multiply_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    const __m512d rhs = _mm512_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] * ip2[0];
    }
    /* only the load from ip1 depends on its alignment */
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d lhs = _mm512_loadu_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_mul_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d lhs = _mm512_load_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_mul_pd(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    const __m256d rhs = _mm256_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] * ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d lhs = _mm256_loadu_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_mul_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d lhs = _mm256_load_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_mul_pd(lhs, rhs));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    const __m128d rhs = _mm_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] * ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d lhs = _mm_loadu_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_mul_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d lhs = _mm_load_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_mul_pd(lhs, rhs));
        }
    }
#endif
    /* whatever the vector loop left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] * ip2[0];
    }
}


#line 47
/*
 * Element-wise quotient of two double arrays:
 * op[i] = ip1[i] / ip2[i] for every i in [0, n).
 *
 * One vector width is compiled in, selected by the NPY_HAVE_* macros
 * (AVX512F: 64 bytes, AVX2: 32 bytes, otherwise 16 bytes).  The LOOP_*
 * macros are defined outside this section and supply the index `i`;
 * presumably they split the range into a scalar prefix that aligns `op`,
 * a vector-blocked middle and a scalar tail.
 *
 * Inside each width there is one loop per combination of input alignment,
 * plus a variant for ip1 == ip2 that loads each block only once.
 */
static void
sse2_binary_divide_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] / ip2[i];
    }
    /* one specialised loop per input-alignment combination */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m512d x = _mm512_load_pd(&ip1[i]);
                const __m512d y = _mm512_loadu_pd(&ip2[i]);
                _mm512_store_pd(&op[i], _mm512_div_pd(x, y));
            }
        }
        else if (ip1 == ip2) {
            /* same input twice: a single load feeds both operands */
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m512d x = _mm512_load_pd(&ip1[i]);
                _mm512_store_pd(&op[i], _mm512_div_pd(x, x));
            }
        }
        else {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m512d x = _mm512_load_pd(&ip1[i]);
                const __m512d y = _mm512_load_pd(&ip2[i]);
                _mm512_store_pd(&op[i], _mm512_div_pd(x, y));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d x = _mm512_loadu_pd(&ip1[i]);
            const __m512d y = _mm512_load_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_div_pd(x, y));
        }
    }
    else if (ip1 == ip2) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d x = _mm512_loadu_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_div_pd(x, x));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d x = _mm512_loadu_pd(&ip1[i]);
            const __m512d y = _mm512_loadu_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_div_pd(x, y));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] / ip2[i];
    }
    /* one specialised loop per input-alignment combination */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m256d x = _mm256_load_pd(&ip1[i]);
                const __m256d y = _mm256_loadu_pd(&ip2[i]);
                _mm256_store_pd(&op[i], _mm256_div_pd(x, y));
            }
        }
        else if (ip1 == ip2) {
            /* same input twice: a single load feeds both operands */
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m256d x = _mm256_load_pd(&ip1[i]);
                _mm256_store_pd(&op[i], _mm256_div_pd(x, x));
            }
        }
        else {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m256d x = _mm256_load_pd(&ip1[i]);
                const __m256d y = _mm256_load_pd(&ip2[i]);
                _mm256_store_pd(&op[i], _mm256_div_pd(x, y));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d x = _mm256_loadu_pd(&ip1[i]);
            const __m256d y = _mm256_load_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_div_pd(x, y));
        }
    }
    else if (ip1 == ip2) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d x = _mm256_loadu_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_div_pd(x, x));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d x = _mm256_loadu_pd(&ip1[i]);
            const __m256d y = _mm256_loadu_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_div_pd(x, y));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] / ip2[i];
    }
    /* one specialised loop per input-alignment combination */
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m128d x = _mm_load_pd(&ip1[i]);
                const __m128d y = _mm_loadu_pd(&ip2[i]);
                _mm_store_pd(&op[i], _mm_div_pd(x, y));
            }
        }
        else if (ip1 == ip2) {
            /* same input twice: a single load feeds both operands */
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m128d x = _mm_load_pd(&ip1[i]);
                _mm_store_pd(&op[i], _mm_div_pd(x, x));
            }
        }
        else {
            LOOP_BLOCKED(npy_double, vector_size_bytes) {
                const __m128d x = _mm_load_pd(&ip1[i]);
                const __m128d y = _mm_load_pd(&ip2[i]);
                _mm_store_pd(&op[i], _mm_div_pd(x, y));
            }
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d x = _mm_loadu_pd(&ip1[i]);
            const __m128d y = _mm_load_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_div_pd(x, y));
        }
    }
    else if (ip1 == ip2) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d x = _mm_loadu_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_div_pd(x, x));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d x = _mm_loadu_pd(&ip1[i]);
            const __m128d y = _mm_loadu_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_div_pd(x, y));
        }
    }
#endif
    /* whatever the vector loop left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] / ip2[i];
    }
}

/*
 * Divide a broadcast scalar by a double array:
 * op[i] = ip1[0] / ip2[i] for every i in [0, n).
 *
 * One vector width is compiled in, selected by the NPY_HAVE_* macros
 * (AVX512F: 64 bytes, AVX2: 32 bytes, otherwise 16 bytes).  The LOOP_*
 * macros are defined outside this section and supply the index `i`;
 * presumably they split the range into a scalar prefix that aligns `op`,
 * a vector-blocked middle and a scalar tail.
 */
static void
sse2_binary_scalar1_divide_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    const __m512d lhs = _mm512_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[0] / ip2[i];
    }
    /* only the load from ip2 depends on its alignment */
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d rhs = _mm512_loadu_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_div_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d rhs = _mm512_load_pd(&ip2[i]);
            _mm512_store_pd(&op[i], _mm512_div_pd(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    const __m256d lhs = _mm256_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[0] / ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d rhs = _mm256_loadu_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_div_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d rhs = _mm256_load_pd(&ip2[i]);
            _mm256_store_pd(&op[i], _mm256_div_pd(lhs, rhs));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    const __m128d lhs = _mm_set1_pd(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[0] / ip2[i];
    }
    if (!npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d rhs = _mm_loadu_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_div_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d rhs = _mm_load_pd(&ip2[i]);
            _mm_store_pd(&op[i], _mm_div_pd(lhs, rhs));
        }
    }
#endif
    /* whatever the vector loop left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[0] / ip2[i];
    }
}

/*
 * Divide a double array by a broadcast scalar:
 * op[i] = ip1[i] / ip2[0] for every i in [0, n).
 *
 * One vector width is compiled in, selected by the NPY_HAVE_* macros
 * (AVX512F: 64 bytes, AVX2: 32 bytes, otherwise 16 bytes).  The LOOP_*
 * macros are defined outside this section and supply the index `i`;
 * presumably they split the range into a scalar prefix that aligns `op`,
 * a vector-blocked middle and a scalar tail.
 */
static void
sse2_binary_scalar2_divide_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
#ifdef NPY_HAVE_AVX512F
    const npy_intp vector_size_bytes = 64;
    const __m512d rhs = _mm512_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] / ip2[0];
    }
    /* only the load from ip1 depends on its alignment */
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d lhs = _mm512_loadu_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_div_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m512d lhs = _mm512_load_pd(&ip1[i]);
            _mm512_store_pd(&op[i], _mm512_div_pd(lhs, rhs));
        }
    }
#elif defined NPY_HAVE_AVX2
    const npy_intp vector_size_bytes = 32;
    const __m256d rhs = _mm256_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] / ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d lhs = _mm256_loadu_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_div_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m256d lhs = _mm256_load_pd(&ip1[i]);
            _mm256_store_pd(&op[i], _mm256_div_pd(lhs, rhs));
        }
    }
#else
    const npy_intp vector_size_bytes = 16;
    const __m128d rhs = _mm_set1_pd(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, vector_size_bytes) {
        op[i] = ip1[i] / ip2[0];
    }
    if (!npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d lhs = _mm_loadu_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_div_pd(lhs, rhs));
        }
    }
    else {
        LOOP_BLOCKED(npy_double, vector_size_bytes) {
            const __m128d lhs = _mm_load_pd(&ip1[i]);
            _mm_store_pd(&op[i], _mm_div_pd(lhs, rhs));
        }
    }
#endif
    /* whatever the vector loop left over */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] / ip2[0];
    }
}




#else // NPY_HAVE_SSE2

#line 369
#if NPY_SIMD
#line 376

/*
 * Element-wise op[i] = ip1[i] + ip2[i] over n floats using npyv_* vectors.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.  When both inputs are
 * the same pointer, each block is loaded only once.
 */
static void
simd_binary_add_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] + ip2[i];
    }
    if (ip1 != ip2) {
        /* general case: two distinct input streams */
        LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
            const npyv_f32 lhs = npyv_load_f32(&ip1[i]);
            const npyv_f32 rhs = npyv_load_f32(&ip2[i]);
            npyv_store_f32(&op[i], npyv_add_f32(lhs, rhs));
        }
    }
    else {
        /* x + x: one load serves both operands */
        LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
            const npyv_f32 vec = npyv_load_f32(&ip1[i]);
            npyv_store_f32(&op[i], npyv_add_f32(vec, vec));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] + ip2[i];
    }
}

/*
 * op[i] = ip1[0] + ip2[i] over n floats: the first operand is a scalar
 * broadcast into an npyv_* vector once, the second is an array.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.
 */
static void
simd_binary_scalar1_add_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    const npyv_f32 bcast = npyv_setall_f32(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[0] + ip2[i];
    }
    LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
        const npyv_f32 vec = npyv_load_f32(&ip2[i]);
        npyv_store_f32(&op[i], npyv_add_f32(bcast, vec));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[0] + ip2[i];
    }
}

/*
 * op[i] = ip1[i] + ip2[0] over n floats: the first operand is an array,
 * the second is a scalar broadcast into an npyv_* vector once.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.
 */
static void
simd_binary_scalar2_add_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    const npyv_f32 bcast = npyv_setall_f32(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] + ip2[0];
    }
    LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
        const npyv_f32 vec = npyv_load_f32(&ip1[i]);
        npyv_store_f32(&op[i], npyv_add_f32(vec, bcast));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] + ip2[0];
    }
}

#line 376

/*
 * Element-wise op[i] = ip1[i] - ip2[i] over n floats using npyv_* vectors.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.  When both inputs are
 * the same pointer, each block is loaded only once.
 */
static void
simd_binary_subtract_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] - ip2[i];
    }
    if (ip1 != ip2) {
        /* general case: two distinct input streams */
        LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
            const npyv_f32 lhs = npyv_load_f32(&ip1[i]);
            const npyv_f32 rhs = npyv_load_f32(&ip2[i]);
            npyv_store_f32(&op[i], npyv_sub_f32(lhs, rhs));
        }
    }
    else {
        /* x - x: one load serves both operands */
        LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
            const npyv_f32 vec = npyv_load_f32(&ip1[i]);
            npyv_store_f32(&op[i], npyv_sub_f32(vec, vec));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] - ip2[i];
    }
}

/*
 * op[i] = ip1[0] - ip2[i] over n floats: the minuend is a scalar
 * broadcast into an npyv_* vector once, the subtrahend is an array.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.
 */
static void
simd_binary_scalar1_subtract_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    const npyv_f32 bcast = npyv_setall_f32(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[0] - ip2[i];
    }
    LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
        const npyv_f32 vec = npyv_load_f32(&ip2[i]);
        npyv_store_f32(&op[i], npyv_sub_f32(bcast, vec));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[0] - ip2[i];
    }
}

/*
 * op[i] = ip1[i] - ip2[0] over n floats: the minuend is an array, the
 * subtrahend is a scalar broadcast into an npyv_* vector once.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.
 */
static void
simd_binary_scalar2_subtract_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    const npyv_f32 bcast = npyv_setall_f32(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] - ip2[0];
    }
    LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
        const npyv_f32 vec = npyv_load_f32(&ip1[i]);
        npyv_store_f32(&op[i], npyv_sub_f32(vec, bcast));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] - ip2[0];
    }
}

#line 376

/*
 * Element-wise op[i] = ip1[i] * ip2[i] over n floats using npyv_* vectors.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.  When both inputs are
 * the same pointer, each block is loaded only once.
 */
static void
simd_binary_multiply_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] * ip2[i];
    }
    if (ip1 != ip2) {
        /* general case: two distinct input streams */
        LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
            const npyv_f32 lhs = npyv_load_f32(&ip1[i]);
            const npyv_f32 rhs = npyv_load_f32(&ip2[i]);
            npyv_store_f32(&op[i], npyv_mul_f32(lhs, rhs));
        }
    }
    else {
        /* x * x: one load serves both operands */
        LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
            const npyv_f32 vec = npyv_load_f32(&ip1[i]);
            npyv_store_f32(&op[i], npyv_mul_f32(vec, vec));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] * ip2[i];
    }
}

/*
 * op[i] = ip1[0] * ip2[i] over n floats: the first factor is a scalar
 * broadcast into an npyv_* vector once, the second is an array.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.
 */
static void
simd_binary_scalar1_multiply_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    const npyv_f32 bcast = npyv_setall_f32(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[0] * ip2[i];
    }
    LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
        const npyv_f32 vec = npyv_load_f32(&ip2[i]);
        npyv_store_f32(&op[i], npyv_mul_f32(bcast, vec));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[0] * ip2[i];
    }
}

/*
 * op[i] = ip1[i] * ip2[0] over n floats: the first factor is an array,
 * the second is a scalar broadcast into an npyv_* vector once.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.
 */
static void
simd_binary_scalar2_multiply_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    const npyv_f32 bcast = npyv_setall_f32(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] * ip2[0];
    }
    LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
        const npyv_f32 vec = npyv_load_f32(&ip1[i]);
        npyv_store_f32(&op[i], npyv_mul_f32(vec, bcast));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] * ip2[0];
    }
}

#line 376

/*
 * Element-wise op[i] = ip1[i] / ip2[i] over n floats using npyv_* vectors.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.  When both inputs are
 * the same pointer, each block is loaded only once.
 */
static void
simd_binary_divide_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] / ip2[i];
    }
    if (ip1 != ip2) {
        /* general case: two distinct input streams */
        LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
            const npyv_f32 lhs = npyv_load_f32(&ip1[i]);
            const npyv_f32 rhs = npyv_load_f32(&ip2[i]);
            npyv_store_f32(&op[i], npyv_div_f32(lhs, rhs));
        }
    }
    else {
        /* x / x: one load serves both operands */
        LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
            const npyv_f32 vec = npyv_load_f32(&ip1[i]);
            npyv_store_f32(&op[i], npyv_div_f32(vec, vec));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] / ip2[i];
    }
}

/*
 * op[i] = ip1[0] / ip2[i] over n floats: the dividend is a scalar
 * broadcast into an npyv_* vector once, the divisor is an array.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.
 */
static void
simd_binary_scalar1_divide_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    const npyv_f32 bcast = npyv_setall_f32(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[0] / ip2[i];
    }
    LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
        const npyv_f32 vec = npyv_load_f32(&ip2[i]);
        npyv_store_f32(&op[i], npyv_div_f32(bcast, vec));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[0] / ip2[i];
    }
}

/*
 * op[i] = ip1[i] / ip2[0] over n floats: the dividend is an array, the
 * divisor is a scalar broadcast into an npyv_* vector once.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.
 */
static void
simd_binary_scalar2_divide_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    const npyv_f32 bcast = npyv_setall_f32(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] / ip2[0];
    }
    LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
        const npyv_f32 vec = npyv_load_f32(&ip1[i]);
        npyv_store_f32(&op[i], npyv_div_f32(vec, bcast));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] / ip2[0];
    }
}

#endif /* NPY_SIMD */

#line 369
#if NPY_SIMD_F64
#line 376

/*
 * Element-wise op[i] = ip1[i] + ip2[i] over n doubles using npyv_* vectors.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.  When both inputs are
 * the same pointer, each block is loaded only once.
 */
static void
simd_binary_add_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] + ip2[i];
    }
    if (ip1 != ip2) {
        /* general case: two distinct input streams */
        LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
            const npyv_f64 lhs = npyv_load_f64(&ip1[i]);
            const npyv_f64 rhs = npyv_load_f64(&ip2[i]);
            npyv_store_f64(&op[i], npyv_add_f64(lhs, rhs));
        }
    }
    else {
        /* x + x: one load serves both operands */
        LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
            const npyv_f64 vec = npyv_load_f64(&ip1[i]);
            npyv_store_f64(&op[i], npyv_add_f64(vec, vec));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] + ip2[i];
    }
}

/*
 * op[i] = ip1[0] + ip2[i] over n doubles: the first operand is a scalar
 * broadcast into an npyv_* vector once, the second is an array.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.
 */
static void
simd_binary_scalar1_add_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    const npyv_f64 bcast = npyv_setall_f64(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = ip1[0] + ip2[i];
    }
    LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
        const npyv_f64 vec = npyv_load_f64(&ip2[i]);
        npyv_store_f64(&op[i], npyv_add_f64(bcast, vec));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[0] + ip2[i];
    }
}

/*
 * op[i] = ip1[i] + ip2[0] over n doubles: the first operand is an array,
 * the second is a scalar broadcast into an npyv_* vector once.
 * The LOOP_* macros (defined outside this section) provide the index `i`
 * and the prefix / blocked / tail iteration ranges.
 */
static void
simd_binary_scalar2_add_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    const npyv_f64 bcast = npyv_setall_f64(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] + ip2[0];
    }
    LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
        const npyv_f64 vec = npyv_load_f64(&ip1[i]);
        npyv_store_f64(&op[i], npyv_add_f64(vec, bcast));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] + ip2[0];
    }
}

#line 376

/*
 * Contiguous DOUBLE kernel: op[i] = ip1[i] - ip2[i] for i in [0, n).
 *
 * The index i and the three loop phases are supplied by the LOOP_* macros:
 * a scalar loop keyed on op, a loop that moves one vector at a time, and a
 * closing scalar loop.
 * NOTE(review): the macro definitions are not visible in this chunk.
 */
static void
simd_binary_subtract_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] - ip2[i];
    }
    if (ip1 != ip2) {
        /* two distinct inputs: one load per operand */
        LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
            const npyv_f64 lhs = npyv_load_f64(&ip1[i]);
            const npyv_f64 rhs = npyv_load_f64(&ip2[i]);
            npyv_store_f64(&op[i], npyv_sub_f64(lhs, rhs));
        }
    }
    else {
        /* same input on both sides: a single load serves both operands */
        LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
            const npyv_f64 both = npyv_load_f64(&ip1[i]);
            npyv_store_f64(&op[i], npyv_sub_f64(both, both));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] - ip2[i];
    }
}

/*
 * Contiguous DOUBLE kernel with a scalar first operand:
 * op[i] = ip1[0] - ip2[i] for i in [0, n).
 *
 * The index i is supplied by the LOOP_* macros (definitions not visible in
 * this chunk).
 */
static void
simd_binary_scalar1_subtract_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    /* ip1[0] replicated into every lane, read once before any store */
    const npyv_f64 lhs_splat = npyv_setall_f64(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = *ip1 - ip2[i];
    }
    LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
        const npyv_f64 rhs = npyv_load_f64(&ip2[i]);
        npyv_store_f64(&op[i], npyv_sub_f64(lhs_splat, rhs));
    }
    LOOP_BLOCKED_END {
        op[i] = *ip1 - ip2[i];
    }
}

/*
 * Contiguous DOUBLE kernel with a scalar second operand:
 * op[i] = ip1[i] - ip2[0] for i in [0, n).
 *
 * The index i is supplied by the LOOP_* macros (definitions not visible in
 * this chunk).
 */
static void
simd_binary_scalar2_subtract_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    /* ip2[0] replicated into every lane, read once before any store */
    const npyv_f64 rhs_splat = npyv_setall_f64(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] - *ip2;
    }
    LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
        const npyv_f64 lhs = npyv_load_f64(&ip1[i]);
        npyv_store_f64(&op[i], npyv_sub_f64(lhs, rhs_splat));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] - *ip2;
    }
}

#line 376

/*
 * Contiguous DOUBLE kernel: op[i] = ip1[i] * ip2[i] for i in [0, n).
 *
 * The index i and the three loop phases are supplied by the LOOP_* macros:
 * a scalar loop keyed on op, a loop that moves one vector at a time, and a
 * closing scalar loop.
 * NOTE(review): the macro definitions are not visible in this chunk.
 */
static void
simd_binary_multiply_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] * ip2[i];
    }
    if (ip1 != ip2) {
        /* two distinct inputs: one load per operand */
        LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
            const npyv_f64 lhs = npyv_load_f64(&ip1[i]);
            const npyv_f64 rhs = npyv_load_f64(&ip2[i]);
            npyv_store_f64(&op[i], npyv_mul_f64(lhs, rhs));
        }
    }
    else {
        /* same input on both sides: a single load serves both operands */
        LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
            const npyv_f64 both = npyv_load_f64(&ip1[i]);
            npyv_store_f64(&op[i], npyv_mul_f64(both, both));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] * ip2[i];
    }
}

/*
 * Contiguous DOUBLE kernel with a scalar first operand:
 * op[i] = ip1[0] * ip2[i] for i in [0, n).
 *
 * The index i is supplied by the LOOP_* macros (definitions not visible in
 * this chunk).
 */
static void
simd_binary_scalar1_multiply_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    /* ip1[0] replicated into every lane, read once before any store */
    const npyv_f64 lhs_splat = npyv_setall_f64(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = *ip1 * ip2[i];
    }
    LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
        const npyv_f64 rhs = npyv_load_f64(&ip2[i]);
        npyv_store_f64(&op[i], npyv_mul_f64(lhs_splat, rhs));
    }
    LOOP_BLOCKED_END {
        op[i] = *ip1 * ip2[i];
    }
}

/*
 * Contiguous DOUBLE kernel with a scalar second operand:
 * op[i] = ip1[i] * ip2[0] for i in [0, n).
 *
 * The index i is supplied by the LOOP_* macros (definitions not visible in
 * this chunk).
 */
static void
simd_binary_scalar2_multiply_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    /* ip2[0] replicated into every lane, read once before any store */
    const npyv_f64 rhs_splat = npyv_setall_f64(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] * (*ip2);
    }
    LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
        const npyv_f64 lhs = npyv_load_f64(&ip1[i]);
        npyv_store_f64(&op[i], npyv_mul_f64(lhs, rhs_splat));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] * (*ip2);
    }
}

#line 376

/*
 * Contiguous DOUBLE kernel: op[i] = ip1[i] / ip2[i] for i in [0, n).
 *
 * The index i and the three loop phases are supplied by the LOOP_* macros:
 * a scalar loop keyed on op, a loop that moves one vector at a time, and a
 * closing scalar loop.
 * NOTE(review): the macro definitions are not visible in this chunk.
 */
static void
simd_binary_divide_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] / ip2[i];
    }
    if (ip1 != ip2) {
        /* two distinct inputs: one load per operand */
        LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
            const npyv_f64 num = npyv_load_f64(&ip1[i]);
            const npyv_f64 den = npyv_load_f64(&ip2[i]);
            npyv_store_f64(&op[i], npyv_div_f64(num, den));
        }
    }
    else {
        /* same input on both sides: a single load serves both operands */
        LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
            const npyv_f64 both = npyv_load_f64(&ip1[i]);
            npyv_store_f64(&op[i], npyv_div_f64(both, both));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] / ip2[i];
    }
}

/*
 * Contiguous DOUBLE kernel with a scalar first operand:
 * op[i] = ip1[0] / ip2[i] for i in [0, n).
 *
 * The index i is supplied by the LOOP_* macros (definitions not visible in
 * this chunk).
 */
static void
simd_binary_scalar1_divide_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    /* ip1[0] replicated into every lane, read once before any store */
    const npyv_f64 num_splat = npyv_setall_f64(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = *ip1 / ip2[i];
    }
    LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
        const npyv_f64 den = npyv_load_f64(&ip2[i]);
        npyv_store_f64(&op[i], npyv_div_f64(num_splat, den));
    }
    LOOP_BLOCKED_END {
        op[i] = *ip1 / ip2[i];
    }
}

/*
 * Contiguous DOUBLE kernel with a scalar second operand:
 * op[i] = ip1[i] / ip2[0] for i in [0, n).
 *
 * The index i is supplied by the LOOP_* macros (definitions not visible in
 * this chunk).
 */
static void
simd_binary_scalar2_divide_DOUBLE(npy_double * op, npy_double * ip1, npy_double * ip2, npy_intp n)
{
    /* ip2[0] replicated into every lane, read once before any store */
    const npyv_f64 den_splat = npyv_setall_f64(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, npy_double, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] / (*ip2);
    }
    LOOP_BLOCKED(npy_double, NPY_SIMD_WIDTH) {
        const npyv_f64 num = npyv_load_f64(&ip1[i]);
        npyv_store_f64(&op[i], npyv_div_f64(num, den_splat));
    }
    LOOP_BLOCKED_END {
        op[i] = ip1[i] / (*ip2);
    }
}

#endif /* NPY_SIMD_F64 */

#endif // NPY_HAVE_SSE2

#line 449
#line 453
/*
 * Try to run FLOAT add through one of the blocked vector kernels.
 *
 * The IS_BLOCKABLE_* macros pick between the scalar-first-operand,
 * scalar-second-operand and array/array kernels.
 * NOTE(review): those macros are defined outside this chunk and appear to
 * read args/steps implicitly.
 *
 * Returns 1 if a kernel processed the loop, 0 if the caller must fall back.
 */
static NPY_INLINE int
run_binary_simd_add_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if 1 && defined NPY_HAVE_SSE2
    npy_intp n = dimensions[0];
    npy_float * op = (npy_float *)args[2];
    npy_float * ip1 = (npy_float *)args[0];
    npy_float * ip2 = (npy_float *)args[1];
    /* only AVX512F uses 64; AVX2 and every other build use 32 */
#if defined NPY_HAVE_AVX512F
    const npy_uintp vector_size_bytes = 64;
#else
    const npy_uintp vector_size_bytes = 32;
#endif
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_scalar1_add_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_scalar2_add_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_add_FLOAT(op, ip1, ip2, n);
        return 1;
    }
#elif NPY_SIMD
    npy_intp n = dimensions[0];
    npy_float * op = (npy_float *)args[2];
    npy_float * ip1 = (npy_float *)args[0];
    npy_float * ip2 = (npy_float *)args[1];
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_add_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_add_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_add_FLOAT(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}

#line 453
/*
 * Try to run FLOAT subtract through one of the blocked vector kernels.
 *
 * The IS_BLOCKABLE_* macros pick between the scalar-first-operand,
 * scalar-second-operand and array/array kernels.
 * NOTE(review): those macros are defined outside this chunk and appear to
 * read args/steps implicitly.
 *
 * Returns 1 if a kernel processed the loop, 0 if the caller must fall back.
 */
static NPY_INLINE int
run_binary_simd_subtract_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if 1 && defined NPY_HAVE_SSE2
    npy_intp n = dimensions[0];
    npy_float * op = (npy_float *)args[2];
    npy_float * ip1 = (npy_float *)args[0];
    npy_float * ip2 = (npy_float *)args[1];
    /* only AVX512F uses 64; AVX2 and every other build use 32 */
#if defined NPY_HAVE_AVX512F
    const npy_uintp vector_size_bytes = 64;
#else
    const npy_uintp vector_size_bytes = 32;
#endif
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_scalar1_subtract_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_scalar2_subtract_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_subtract_FLOAT(op, ip1, ip2, n);
        return 1;
    }
#elif NPY_SIMD
    npy_intp n = dimensions[0];
    npy_float * op = (npy_float *)args[2];
    npy_float * ip1 = (npy_float *)args[0];
    npy_float * ip2 = (npy_float *)args[1];
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_subtract_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_subtract_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_subtract_FLOAT(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}

#line 453
/*
 * Try to run FLOAT multiply through one of the blocked vector kernels.
 *
 * The IS_BLOCKABLE_* macros pick between the scalar-first-operand,
 * scalar-second-operand and array/array kernels.
 * NOTE(review): those macros are defined outside this chunk and appear to
 * read args/steps implicitly.
 *
 * Returns 1 if a kernel processed the loop, 0 if the caller must fall back.
 */
static NPY_INLINE int
run_binary_simd_multiply_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if 1 && defined NPY_HAVE_SSE2
    npy_intp n = dimensions[0];
    npy_float * op = (npy_float *)args[2];
    npy_float * ip1 = (npy_float *)args[0];
    npy_float * ip2 = (npy_float *)args[1];
    /* only AVX512F uses 64; AVX2 and every other build use 32 */
#if defined NPY_HAVE_AVX512F
    const npy_uintp vector_size_bytes = 64;
#else
    const npy_uintp vector_size_bytes = 32;
#endif
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_scalar1_multiply_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_scalar2_multiply_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_multiply_FLOAT(op, ip1, ip2, n);
        return 1;
    }
#elif NPY_SIMD
    npy_intp n = dimensions[0];
    npy_float * op = (npy_float *)args[2];
    npy_float * ip1 = (npy_float *)args[0];
    npy_float * ip2 = (npy_float *)args[1];
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_multiply_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_multiply_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_multiply_FLOAT(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}

#line 453
/*
 * Try to run FLOAT divide through one of the blocked vector kernels.
 *
 * The IS_BLOCKABLE_* macros pick between the scalar-first-operand,
 * scalar-second-operand and array/array kernels.
 * NOTE(review): those macros are defined outside this chunk and appear to
 * read args/steps implicitly.
 *
 * Returns 1 if a kernel processed the loop, 0 if the caller must fall back.
 */
static NPY_INLINE int
run_binary_simd_divide_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if 1 && defined NPY_HAVE_SSE2
    npy_intp n = dimensions[0];
    npy_float * op = (npy_float *)args[2];
    npy_float * ip1 = (npy_float *)args[0];
    npy_float * ip2 = (npy_float *)args[1];
    /* only AVX512F uses 64; AVX2 and every other build use 32 */
#if defined NPY_HAVE_AVX512F
    const npy_uintp vector_size_bytes = 64;
#else
    const npy_uintp vector_size_bytes = 32;
#endif
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_scalar1_divide_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_scalar2_divide_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_float), vector_size_bytes)) {
        sse2_binary_divide_FLOAT(op, ip1, ip2, n);
        return 1;
    }
#elif NPY_SIMD
    npy_intp n = dimensions[0];
    npy_float * op = (npy_float *)args[2];
    npy_float * ip1 = (npy_float *)args[0];
    npy_float * ip2 = (npy_float *)args[1];
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_divide_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_divide_FLOAT(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_float), NPY_SIMD_WIDTH)) {
        simd_binary_divide_FLOAT(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}


#line 449
#line 453
/*
 * Try to run DOUBLE add through one of the blocked vector kernels.
 *
 * The IS_BLOCKABLE_* macros pick between the scalar-first-operand,
 * scalar-second-operand and array/array kernels.
 * NOTE(review): those macros are defined outside this chunk and appear to
 * read args/steps implicitly.
 *
 * Returns 1 if a kernel processed the loop, 0 if the caller must fall back.
 */
static NPY_INLINE int
run_binary_simd_add_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if 1 && defined NPY_HAVE_SSE2
    npy_intp n = dimensions[0];
    npy_double * op = (npy_double *)args[2];
    npy_double * ip1 = (npy_double *)args[0];
    npy_double * ip2 = (npy_double *)args[1];
    /* only AVX512F uses 64; AVX2 and every other build use 32 */
#if defined NPY_HAVE_AVX512F
    const npy_uintp vector_size_bytes = 64;
#else
    const npy_uintp vector_size_bytes = 32;
#endif
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_scalar1_add_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_scalar2_add_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_add_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
#elif NPY_SIMD_F64
    npy_intp n = dimensions[0];
    npy_double * op = (npy_double *)args[2];
    npy_double * ip1 = (npy_double *)args[0];
    npy_double * ip2 = (npy_double *)args[1];
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_add_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_add_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_add_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}

#line 453
/*
 * Try to run DOUBLE subtract through one of the blocked vector kernels.
 *
 * The IS_BLOCKABLE_* macros pick between the scalar-first-operand,
 * scalar-second-operand and array/array kernels.
 * NOTE(review): those macros are defined outside this chunk and appear to
 * read args/steps implicitly.
 *
 * Returns 1 if a kernel processed the loop, 0 if the caller must fall back.
 */
static NPY_INLINE int
run_binary_simd_subtract_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if 1 && defined NPY_HAVE_SSE2
    npy_intp n = dimensions[0];
    npy_double * op = (npy_double *)args[2];
    npy_double * ip1 = (npy_double *)args[0];
    npy_double * ip2 = (npy_double *)args[1];
    /* only AVX512F uses 64; AVX2 and every other build use 32 */
#if defined NPY_HAVE_AVX512F
    const npy_uintp vector_size_bytes = 64;
#else
    const npy_uintp vector_size_bytes = 32;
#endif
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_scalar1_subtract_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_scalar2_subtract_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_subtract_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
#elif NPY_SIMD_F64
    npy_intp n = dimensions[0];
    npy_double * op = (npy_double *)args[2];
    npy_double * ip1 = (npy_double *)args[0];
    npy_double * ip2 = (npy_double *)args[1];
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_subtract_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_subtract_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_subtract_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}

#line 453
/*
 * Try to run DOUBLE multiply through one of the blocked vector kernels.
 *
 * The IS_BLOCKABLE_* macros pick between the scalar-first-operand,
 * scalar-second-operand and array/array kernels.
 * NOTE(review): those macros are defined outside this chunk and appear to
 * read args/steps implicitly.
 *
 * Returns 1 if a kernel processed the loop, 0 if the caller must fall back.
 */
static NPY_INLINE int
run_binary_simd_multiply_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if 1 && defined NPY_HAVE_SSE2
    npy_intp n = dimensions[0];
    npy_double * op = (npy_double *)args[2];
    npy_double * ip1 = (npy_double *)args[0];
    npy_double * ip2 = (npy_double *)args[1];
    /* only AVX512F uses 64; AVX2 and every other build use 32 */
#if defined NPY_HAVE_AVX512F
    const npy_uintp vector_size_bytes = 64;
#else
    const npy_uintp vector_size_bytes = 32;
#endif
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_scalar1_multiply_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_scalar2_multiply_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_multiply_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
#elif NPY_SIMD_F64
    npy_intp n = dimensions[0];
    npy_double * op = (npy_double *)args[2];
    npy_double * ip1 = (npy_double *)args[0];
    npy_double * ip2 = (npy_double *)args[1];
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_multiply_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_multiply_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_multiply_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}

#line 453
/*
 * Try to run DOUBLE divide through one of the blocked vector kernels.
 *
 * The IS_BLOCKABLE_* macros pick between the scalar-first-operand,
 * scalar-second-operand and array/array kernels.
 * NOTE(review): those macros are defined outside this chunk and appear to
 * read args/steps implicitly.
 *
 * Returns 1 if a kernel processed the loop, 0 if the caller must fall back.
 */
static NPY_INLINE int
run_binary_simd_divide_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if 1 && defined NPY_HAVE_SSE2
    npy_intp n = dimensions[0];
    npy_double * op = (npy_double *)args[2];
    npy_double * ip1 = (npy_double *)args[0];
    npy_double * ip2 = (npy_double *)args[1];
    /* only AVX512F uses 64; AVX2 and every other build use 32 */
#if defined NPY_HAVE_AVX512F
    const npy_uintp vector_size_bytes = 64;
#else
    const npy_uintp vector_size_bytes = 32;
#endif
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_scalar1_divide_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_scalar2_divide_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_double), vector_size_bytes)) {
        sse2_binary_divide_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
#elif NPY_SIMD_F64
    npy_intp n = dimensions[0];
    npy_double * op = (npy_double *)args[2];
    npy_double * ip1 = (npy_double *)args[0];
    npy_double * ip2 = (npy_double *)args[1];
    /* first operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_scalar1_divide_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* second operand is a scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_scalar2_divide_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
    /* both operands are arrays */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_double), NPY_SIMD_WIDTH)) {
        simd_binary_divide_DOUBLE(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}


#line 449
#line 453
/*
 * Vector dispatch hook for LONGDOUBLE add.
 *
 * Both dispatch branches the template generated for npy_longdouble were
 * compiled out (`#if 0 && ...` / `#elif 0`), so this function never
 * dispatches to a kernel. It keeps the common run_binary_simd_* signature
 * and always reports "not handled".
 *
 * Returns 0 unconditionally.
 */
static NPY_INLINE int
run_binary_simd_add_LONGDOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    /* parameters exist only to match the other run_binary_simd_* helpers */
    (void)args;
    (void)dimensions;
    (void)steps;
    return 0;
}

#line 453
/*
 * Vector dispatch hook for LONGDOUBLE subtract.
 *
 * Both dispatch branches the template generated for npy_longdouble were
 * compiled out (`#if 0 && ...` / `#elif 0`), so this function never
 * dispatches to a kernel. It keeps the common run_binary_simd_* signature
 * and always reports "not handled".
 *
 * Returns 0 unconditionally.
 */
static NPY_INLINE int
run_binary_simd_subtract_LONGDOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    /* parameters exist only to match the other run_binary_simd_* helpers */
    (void)args;
    (void)dimensions;
    (void)steps;
    return 0;
}

#line 453
/*
 * Vector dispatch hook for LONGDOUBLE multiply.
 *
 * Both dispatch branches the template generated for npy_longdouble were
 * compiled out (`#if 0 && ...` / `#elif 0`), so this function never
 * dispatches to a kernel. It keeps the common run_binary_simd_* signature
 * and always reports "not handled".
 *
 * Returns 0 unconditionally.
 */
static NPY_INLINE int
run_binary_simd_multiply_LONGDOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    /* parameters exist only to match the other run_binary_simd_* helpers */
    (void)args;
    (void)dimensions;
    (void)steps;
    return 0;
}

#line 453
/*
 * Vector dispatch hook for LONGDOUBLE divide.
 *
 * Both dispatch branches the template generated for npy_longdouble were
 * compiled out (`#if 0 && ...` / `#elif 0`), so this function never
 * dispatches to a kernel. It keeps the common run_binary_simd_* signature
 * and always reports "not handled".
 *
 * Returns 0 unconditionally.
 */
static NPY_INLINE int
run_binary_simd_divide_LONGDOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    /* parameters exist only to match the other run_binary_simd_* helpers */
    (void)args;
    (void)dimensions;
    (void)steps;
    return 0;
}



/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
#line 517
#line 523
/*
 * ufunc inner loop for FLOAT add.
 *
 * Reduction case (IS_BINARY_REDUCE): the value at args[0] is incremented by
 * FLOAT_pairwise_sum over the second operand.
 * Otherwise run_binary_simd_add_FLOAT is tried first; when it returns 0 the
 * generic strided BINARY_LOOP computes in1 + in2 per element.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_add)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if 1
        npy_intp n = dimensions[0];
        npy_float * iop1 = (npy_float *)args[0];

        *iop1 += FLOAT_pairwise_sum(args[1], n, steps[1]);
#else
        BINARY_REDUCE_LOOP(npy_float) {
            io1 += *(npy_float *)ip2;
        }
        *((npy_float *)iop1) = io1;
#endif
        return;
    }
    if (run_binary_simd_add_FLOAT(args, dimensions, steps)) {
        /* a vector kernel already produced the whole output */
        return;
    }
    /* ip1, ip2 and op1 are provided by BINARY_LOOP */
    BINARY_LOOP {
        const npy_float lhs = *(npy_float *)ip1;
        const npy_float rhs = *(npy_float *)ip2;
        *(npy_float *)op1 = lhs + rhs;
    }
}

#line 523
/*
 * ufunc inner loop for FLOAT subtract.
 *
 * Reduction case (IS_BINARY_REDUCE): the pairwise-sum variant is compiled
 * out (#if 0); the accumulator io1 is updated element by element inside
 * BINARY_REDUCE_LOOP and written back through iop1 afterwards.
 * Otherwise run_binary_simd_subtract_FLOAT is tried first; when it returns 0
 * the generic strided BINARY_LOOP computes in1 - in2 per element.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_subtract)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if 0
        npy_intp n = dimensions[0];
        npy_float * iop1 = (npy_float *)args[0];

        *iop1 -= FLOAT_pairwise_sum(args[1], n, steps[1]);
#else
        /* io1, ip2 and iop1 are provided by BINARY_REDUCE_LOOP */
        BINARY_REDUCE_LOOP(npy_float) {
            io1 -= *(npy_float *)ip2;
        }
        *((npy_float *)iop1) = io1;
#endif
        return;
    }
    if (run_binary_simd_subtract_FLOAT(args, dimensions, steps)) {
        /* a vector kernel already produced the whole output */
        return;
    }
    /* ip1, ip2 and op1 are provided by BINARY_LOOP */
    BINARY_LOOP {
        const npy_float lhs = *(npy_float *)ip1;
        const npy_float rhs = *(npy_float *)ip2;
        *(npy_float *)op1 = lhs - rhs;
    }
}

#line 523
/*
 * ufunc inner loop for FLOAT multiply.
 *
 * Reduction case (IS_BINARY_REDUCE): the pairwise-sum variant is compiled
 * out (#if 0); the accumulator io1 is updated element by element inside
 * BINARY_REDUCE_LOOP and written back through iop1 afterwards.
 * Otherwise run_binary_simd_multiply_FLOAT is tried first; when it returns 0
 * the generic strided BINARY_LOOP computes in1 * in2 per element.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_multiply)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if 0
        npy_intp n = dimensions[0];
        npy_float * iop1 = (npy_float *)args[0];

        *iop1 *= FLOAT_pairwise_sum(args[1], n, steps[1]);
#else
        /* io1, ip2 and iop1 are provided by BINARY_REDUCE_LOOP */
        BINARY_REDUCE_LOOP(npy_float) {
            io1 *= *(npy_float *)ip2;
        }
        *((npy_float *)iop1) = io1;
#endif
        return;
    }
    if (run_binary_simd_multiply_FLOAT(args, dimensions, steps)) {
        /* a vector kernel already produced the whole output */
        return;
    }
    /* ip1, ip2 and op1 are provided by BINARY_LOOP */
    BINARY_LOOP {
        const npy_float lhs = *(npy_float *)ip1;
        const npy_float rhs = *(npy_float *)ip2;
        *(npy_float *)op1 = lhs * rhs;
    }
}

#line 523
/*
 * ufunc inner loop for FLOAT divide.
 *
 * Reduction case (IS_BINARY_REDUCE): the pairwise-sum variant is compiled
 * out (#if 0); the accumulator io1 is updated element by element inside
 * BINARY_REDUCE_LOOP and written back through iop1 afterwards.
 * Otherwise run_binary_simd_divide_FLOAT is tried first; when it returns 0
 * the generic strided BINARY_LOOP computes in1 / in2 per element.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if 0
        npy_intp n = dimensions[0];
        npy_float * iop1 = (npy_float *)args[0];

        *iop1 /= FLOAT_pairwise_sum(args[1], n, steps[1]);
#else
        /* io1, ip2 and iop1 are provided by BINARY_REDUCE_LOOP */
        BINARY_REDUCE_LOOP(npy_float) {
            io1 /= *(npy_float *)ip2;
        }
        *((npy_float *)iop1) = io1;
#endif
        return;
    }
    if (run_binary_simd_divide_FLOAT(args, dimensions, steps)) {
        /* a vector kernel already produced the whole output */
        return;
    }
    /* ip1, ip2 and op1 are provided by BINARY_LOOP */
    BINARY_LOOP {
        const npy_float num = *(npy_float *)ip1;
        const npy_float den = *(npy_float *)ip2;
        *(npy_float *)op1 = num / den;
    }
}


#line 517
#line 523
/*
 * ufunc inner loop for DOUBLE add.
 *
 * Reduction case (IS_BINARY_REDUCE): the value at args[0] is incremented by
 * DOUBLE_pairwise_sum over the second operand.
 * Otherwise run_binary_simd_add_DOUBLE is tried first; when it returns 0 the
 * generic strided BINARY_LOOP computes in1 + in2 per element.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_add)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if 1
        npy_intp n = dimensions[0];
        npy_double * iop1 = (npy_double *)args[0];

        *iop1 += DOUBLE_pairwise_sum(args[1], n, steps[1]);
#else
        BINARY_REDUCE_LOOP(npy_double) {
            io1 += *(npy_double *)ip2;
        }
        *((npy_double *)iop1) = io1;
#endif
        return;
    }
    if (run_binary_simd_add_DOUBLE(args, dimensions, steps)) {
        /* a vector kernel already produced the whole output */
        return;
    }
    /* ip1, ip2 and op1 are provided by BINARY_LOOP */
    BINARY_LOOP {
        const npy_double lhs = *(npy_double *)ip1;
        const npy_double rhs = *(npy_double *)ip2;
        *(npy_double *)op1 = lhs + rhs;
    }
}

#line 523
/*
 * ufunc inner loop for DOUBLE subtract.
 *
 * Reduction case (IS_BINARY_REDUCE): the pairwise-sum variant is compiled
 * out (#if 0); the accumulator io1 is updated element by element inside
 * BINARY_REDUCE_LOOP and written back through iop1 afterwards.
 * Otherwise run_binary_simd_subtract_DOUBLE is tried first; when it returns
 * 0 the generic strided BINARY_LOOP computes in1 - in2 per element.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_subtract)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if 0
        npy_intp n = dimensions[0];
        npy_double * iop1 = (npy_double *)args[0];

        *iop1 -= DOUBLE_pairwise_sum(args[1], n, steps[1]);
#else
        /* io1, ip2 and iop1 are provided by BINARY_REDUCE_LOOP */
        BINARY_REDUCE_LOOP(npy_double) {
            io1 -= *(npy_double *)ip2;
        }
        *((npy_double *)iop1) = io1;
#endif
        return;
    }
    if (run_binary_simd_subtract_DOUBLE(args, dimensions, steps)) {
        /* a vector kernel already produced the whole output */
        return;
    }
    /* ip1, ip2 and op1 are provided by BINARY_LOOP */
    BINARY_LOOP {
        const npy_double lhs = *(npy_double *)ip1;
        const npy_double rhs = *(npy_double *)ip2;
        *(npy_double *)op1 = lhs - rhs;
    }
}

#line 523
/*
 * Inner loop for npy_double multiplication.
 *
 * args/dimensions/steps are consumed by the IS_BINARY_REDUCE,
 * BINARY_REDUCE_LOOP and BINARY_LOOP macros (fast_loop_macros.h), which
 * also supply the ip1/ip2/op1/io1/iop1 names used below. The last
 * parameter is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_multiply)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        /* Reduction: multiply the running value by each element in turn. */
        BINARY_REDUCE_LOOP(npy_double) {
            io1 *= *(npy_double *)ip2;
        }
        *((npy_double *)iop1) = io1;
        return;
    }
    if (run_binary_simd_multiply_DOUBLE(args, dimensions, steps)) {
        /* Nonzero return: the helper took care of this call. */
        return;
    }
    /* Scalar fallback, one element per iteration. */
    BINARY_LOOP {
        const npy_double lhs = *(npy_double *)ip1;
        const npy_double rhs = *(npy_double *)ip2;
        *((npy_double *)op1) = lhs * rhs;
    }
}

#line 523
/*
 * Inner loop for npy_double division.
 *
 * args/dimensions/steps are consumed by the IS_BINARY_REDUCE,
 * BINARY_REDUCE_LOOP and BINARY_LOOP macros (fast_loop_macros.h), which
 * also supply the ip1/ip2/op1/io1/iop1 names used below. The last
 * parameter is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        /* Reduction: divide the running value by each element in turn. */
        BINARY_REDUCE_LOOP(npy_double) {
            io1 /= *(npy_double *)ip2;
        }
        *((npy_double *)iop1) = io1;
        return;
    }
    if (run_binary_simd_divide_DOUBLE(args, dimensions, steps)) {
        /* Nonzero return: the helper took care of this call. */
        return;
    }
    /* Scalar fallback, one element per iteration. */
    BINARY_LOOP {
        const npy_double numer = *(npy_double *)ip1;
        const npy_double denom = *(npy_double *)ip2;
        *((npy_double *)op1) = numer / denom;
    }
}



//###############################################################################
//## Complex Single/Double precision
//###############################################################################
/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F)
    /**
     * For some reason, aggressive optimization by MSVC leads to
     * 'RuntimeWarning: invalid value encountered in multiply' being raised.
     *
     * The issue is mainly caused by '_mm512_maskz_loadu_ps'; it needs to be
     * investigated while moving to NPYV.
     */
    #define AVX512F_NOMSVC
#endif

#ifdef AVX512F_NOMSVC
/* Returns a 16-bit lane mask with every bit set. */
NPY_FINLINE __mmask16
avx512_get_full_load_mask_ps(void)
{
    const __mmask16 all_lanes = 0xFFFF;
    return all_lanes;
}

/* Returns an 8-bit lane mask with every bit set. */
NPY_FINLINE __mmask8
avx512_get_full_load_mask_pd(void)
{
    const __mmask8 all_lanes = 0xFF;
    return all_lanes;
}
/*
 * Unaligned, zero-masking load of npy_float lanes from addr
 * (_mm512_maskz_loadu_ps): lanes whose mask bit is clear come back as zero.
 */
NPY_FINLINE __m512
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
    const __m512 *src = (__m512 *)addr;
    return _mm512_maskz_loadu_ps(mask, src);
}

/*
 * Unaligned, zero-masking load of npy_double lanes from addr
 * (_mm512_maskz_loadu_pd): lanes whose mask bit is clear come back as zero.
 */
NPY_FINLINE __m512d
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
{
    const __m512d *src = (__m512d *)addr;
    return _mm512_maskz_loadu_pd(mask, src);
}

/*
 * Returns a mask whose lowest num_elem bits are set.
 * total_elem is accepted but not used. The callers in this file only pass
 * num_elem values below 16.
 */
NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
    const int low_bits = (0x0001 << num_elem) - 0x0001;
    return low_bits;
}

/*
 * Returns a mask whose lowest num_elem bits are set.
 * total_elem is accepted but not used. The callers in this file only pass
 * num_elem values below 8.
 */
NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
{
    const int low_bits = (0x01 << num_elem) - 0x01;
    return low_bits;
}
#line 616
/*
 * Adds to x a copy of itself permuted with control 0xb1 (neighbouring
 * lanes exchanged), so both lanes of each pair hold the pair's sum.
 */
NPY_FINLINE __m512
avx512_hadd_ps(const __m512 x)
{
    const __m512 swapped = _mm512_permute_ps(x, 0xb1);
    return _mm512_add_ps(x, swapped);
}

/*
 * Subtracts from x a copy of itself permuted with control 0xb1
 * (neighbouring lanes exchanged): the even lane of each pair holds
 * even - odd and the odd lane holds odd - even.
 */
NPY_FINLINE __m512
avx512_hsub_ps(const __m512 x)
{
    const __m512 swapped = _mm512_permute_ps(x, 0xb1);
    return _mm512_sub_ps(x, swapped);
}
/*
 * Complex multiply of interleaved (real, imag) npy_float pairs.
 * Lane layout per pair: x1 = r1, i1 and x2 = r2, i2.
 */
NPY_FINLINE __m512
avx512_cmul_ps(__m512 x1, __m512 x2)
{
    const __m512 x2_swapped = _mm512_permute_ps(x2, 0xb1);   // i2, r2
    const __m512 straight   = _mm512_mul_ps(x1, x2);         // r1*r2, i1*i2
    const __m512 crossed    = _mm512_mul_ps(x1, x2_swapped); // r1*i2, i1*r2
    // even lane: r1*r2 - i1*i2 (odd lane holds the negation, dropped below)
    const __m512 real_part  = avx512_hsub_ps(straight);
    // both lanes: r1*i2 + i1*r2
    const __m512 imag_part  = avx512_hadd_ps(crossed);
    // 0xAAAA: odd lanes come from imag_part, even lanes from real_part
    return _mm512_mask_blend_ps(0xAAAA, real_part, imag_part);
}

#line 616
/*
 * Adds to x a copy of itself permuted with control 0x55 (neighbouring
 * lanes exchanged), so both lanes of each pair hold the pair's sum.
 */
NPY_FINLINE __m512d
avx512_hadd_pd(const __m512d x)
{
    const __m512d swapped = _mm512_permute_pd(x, 0x55);
    return _mm512_add_pd(x, swapped);
}

/*
 * Subtracts from x a copy of itself permuted with control 0x55
 * (neighbouring lanes exchanged): the even lane of each pair holds
 * even - odd and the odd lane holds odd - even.
 */
NPY_FINLINE __m512d
avx512_hsub_pd(const __m512d x)
{
    const __m512d swapped = _mm512_permute_pd(x, 0x55);
    return _mm512_sub_pd(x, swapped);
}
/*
 * Complex multiply of interleaved (real, imag) npy_double pairs.
 * Lane layout per pair: x1 = r1, i1 and x2 = r2, i2.
 */
NPY_FINLINE __m512d
avx512_cmul_pd(__m512d x1, __m512d x2)
{
    const __m512d x2_swapped = _mm512_permute_pd(x2, 0x55);   // i2, r2
    const __m512d straight   = _mm512_mul_pd(x1, x2);         // r1*r2, i1*i2
    const __m512d crossed    = _mm512_mul_pd(x1, x2_swapped); // r1*i2, i1*r2
    // even lane: r1*r2 - i1*i2 (odd lane holds the negation, dropped below)
    const __m512d real_part  = avx512_hsub_pd(straight);
    // both lanes: r1*i2 + i1*r2
    const __m512d imag_part  = avx512_hadd_pd(crossed);
    // 0xAA: odd lanes come from imag_part, even lanes from real_part
    return _mm512_mask_blend_pd(0xAA, real_part, imag_part);
}

#endif

#line 656
#line 660
#if defined AVX512F_NOMSVC
static NPY_INLINE void
AVX512F_add_CFLOAT(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    const npy_intp array_size = dimensions[0];
    npy_intp num_remaining_elements = 2*array_size;
    npy_float* ip1 = (npy_float*) args[0];
    npy_float* ip2 = (npy_float*) args[1];
    npy_float* op  = (npy_float*) args[2];

    __mmask16 load_mask = avx512_get_full_load_mask_ps();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < 16) {
            load_mask = avx512_get_partial_load_mask_ps(
                                    num_remaining_elements, 16);
        }
        __m512 x1, x2;
        x1 = avx512_masked_load_ps(load_mask, ip1);
        x2 = avx512_masked_load_ps(load_mask, ip2);

        __m512 out = _mm512_add_ps(x1, x2);

        _mm512_mask_storeu_ps(op, load_mask, out);

        ip1 += 16;
        ip2 += 16;
        op += 16;
        num_remaining_elements -= 16;
    }
}
#endif // AVX512F_NOMSVC

#line 660
#if defined AVX512F_NOMSVC
static NPY_INLINE void
AVX512F_subtract_CFLOAT(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    const npy_intp array_size = dimensions[0];
    npy_intp num_remaining_elements = 2*array_size;
    npy_float* ip1 = (npy_float*) args[0];
    npy_float* ip2 = (npy_float*) args[1];
    npy_float* op  = (npy_float*) args[2];

    __mmask16 load_mask = avx512_get_full_load_mask_ps();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < 16) {
            load_mask = avx512_get_partial_load_mask_ps(
                                    num_remaining_elements, 16);
        }
        __m512 x1, x2;
        x1 = avx512_masked_load_ps(load_mask, ip1);
        x2 = avx512_masked_load_ps(load_mask, ip2);

        __m512 out = _mm512_sub_ps(x1, x2);

        _mm512_mask_storeu_ps(op, load_mask, out);

        ip1 += 16;
        ip2 += 16;
        op += 16;
        num_remaining_elements -= 16;
    }
}
#endif // AVX512F_NOMSVC

#line 660
#if defined AVX512F_NOMSVC
static NPY_INLINE void
AVX512F_multiply_CFLOAT(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    const npy_intp array_size = dimensions[0];
    npy_intp num_remaining_elements = 2*array_size;
    npy_float* ip1 = (npy_float*) args[0];
    npy_float* ip2 = (npy_float*) args[1];
    npy_float* op  = (npy_float*) args[2];

    __mmask16 load_mask = avx512_get_full_load_mask_ps();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < 16) {
            load_mask = avx512_get_partial_load_mask_ps(
                                    num_remaining_elements, 16);
        }
        __m512 x1, x2;
        x1 = avx512_masked_load_ps(load_mask, ip1);
        x2 = avx512_masked_load_ps(load_mask, ip2);

        __m512 out = avx512_cmul_ps(x1, x2);

        _mm512_mask_storeu_ps(op, load_mask, out);

        ip1 += 16;
        ip2 += 16;
        op += 16;
        num_remaining_elements -= 16;
    }
}
#endif // AVX512F_NOMSVC


#line 656
#line 660
#if defined AVX512F_NOMSVC
static NPY_INLINE void
AVX512F_add_CDOUBLE(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    const npy_intp array_size = dimensions[0];
    npy_intp num_remaining_elements = 2*array_size;
    npy_double* ip1 = (npy_double*) args[0];
    npy_double* ip2 = (npy_double*) args[1];
    npy_double* op  = (npy_double*) args[2];

    __mmask8 load_mask = avx512_get_full_load_mask_pd();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < 8) {
            load_mask = avx512_get_partial_load_mask_pd(
                                    num_remaining_elements, 8);
        }
        __m512d x1, x2;
        x1 = avx512_masked_load_pd(load_mask, ip1);
        x2 = avx512_masked_load_pd(load_mask, ip2);

        __m512d out = _mm512_add_pd(x1, x2);

        _mm512_mask_storeu_pd(op, load_mask, out);

        ip1 += 8;
        ip2 += 8;
        op += 8;
        num_remaining_elements -= 8;
    }
}
#endif // AVX512F_NOMSVC

#line 660
#if defined AVX512F_NOMSVC
static NPY_INLINE void
AVX512F_subtract_CDOUBLE(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    const npy_intp array_size = dimensions[0];
    npy_intp num_remaining_elements = 2*array_size;
    npy_double* ip1 = (npy_double*) args[0];
    npy_double* ip2 = (npy_double*) args[1];
    npy_double* op  = (npy_double*) args[2];

    __mmask8 load_mask = avx512_get_full_load_mask_pd();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < 8) {
            load_mask = avx512_get_partial_load_mask_pd(
                                    num_remaining_elements, 8);
        }
        __m512d x1, x2;
        x1 = avx512_masked_load_pd(load_mask, ip1);
        x2 = avx512_masked_load_pd(load_mask, ip2);

        __m512d out = _mm512_sub_pd(x1, x2);

        _mm512_mask_storeu_pd(op, load_mask, out);

        ip1 += 8;
        ip2 += 8;
        op += 8;
        num_remaining_elements -= 8;
    }
}
#endif // AVX512F_NOMSVC

#line 660
#if defined AVX512F_NOMSVC
static NPY_INLINE void
AVX512F_multiply_CDOUBLE(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    const npy_intp array_size = dimensions[0];
    npy_intp num_remaining_elements = 2*array_size;
    npy_double* ip1 = (npy_double*) args[0];
    npy_double* ip2 = (npy_double*) args[1];
    npy_double* op  = (npy_double*) args[2];

    __mmask8 load_mask = avx512_get_full_load_mask_pd();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < 8) {
            load_mask = avx512_get_partial_load_mask_pd(
                                    num_remaining_elements, 8);
        }
        __m512d x1, x2;
        x1 = avx512_masked_load_pd(load_mask, ip1);
        x2 = avx512_masked_load_pd(load_mask, ip2);

        __m512d out = avx512_cmul_pd(x1, x2);

        _mm512_mask_storeu_pd(op, load_mask, out);

        ip1 += 8;
        ip2 += 8;
        op += 8;
        num_remaining_elements -= 8;
    }
}
#endif // AVX512F_NOMSVC



#line 700
#line 703
/*
 * Runs AVX512F_add_CFLOAT when AVX512F_NOMSVC is defined and
 * IS_BINARY_STRIDE_ONE(8, 64) holds.
 * NOTE(review): IS_BINARY_STRIDE_ONE is defined elsewhere; presumably it
 * checks for 8-byte contiguous strides and no overlap within 64 bytes.
 *
 * Returns 1 if the kernel ran, 0 otherwise (always 0 without
 * AVX512F_NOMSVC).
 */
static NPY_INLINE int
run_binary_avx512f_add_CFLOAT(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    int handled = 0;
#if defined AVX512F_NOMSVC
    if (IS_BINARY_STRIDE_ONE(8, 64)) {
        AVX512F_add_CFLOAT(args, dimensions, steps);
        handled = 1;
    }
#endif
    return handled;
}

#line 703
/*
 * Runs AVX512F_subtract_CFLOAT when AVX512F_NOMSVC is defined and
 * IS_BINARY_STRIDE_ONE(8, 64) holds.
 * NOTE(review): IS_BINARY_STRIDE_ONE is defined elsewhere; presumably it
 * checks for 8-byte contiguous strides and no overlap within 64 bytes.
 *
 * Returns 1 if the kernel ran, 0 otherwise (always 0 without
 * AVX512F_NOMSVC).
 */
static NPY_INLINE int
run_binary_avx512f_subtract_CFLOAT(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    int handled = 0;
#if defined AVX512F_NOMSVC
    if (IS_BINARY_STRIDE_ONE(8, 64)) {
        AVX512F_subtract_CFLOAT(args, dimensions, steps);
        handled = 1;
    }
#endif
    return handled;
}

#line 703
/*
 * Runs AVX512F_multiply_CFLOAT when AVX512F_NOMSVC is defined and
 * IS_BINARY_STRIDE_ONE(8, 64) holds.
 * NOTE(review): IS_BINARY_STRIDE_ONE is defined elsewhere; presumably it
 * checks for 8-byte contiguous strides and no overlap within 64 bytes.
 *
 * Returns 1 if the kernel ran, 0 otherwise (always 0 without
 * AVX512F_NOMSVC).
 */
static NPY_INLINE int
run_binary_avx512f_multiply_CFLOAT(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    int handled = 0;
#if defined AVX512F_NOMSVC
    if (IS_BINARY_STRIDE_ONE(8, 64)) {
        AVX512F_multiply_CFLOAT(args, dimensions, steps);
        handled = 1;
    }
#endif
    return handled;
}


#line 700
#line 703
/*
 * Runs AVX512F_add_CDOUBLE when AVX512F_NOMSVC is defined and
 * IS_BINARY_STRIDE_ONE(16, 64) holds.
 * NOTE(review): IS_BINARY_STRIDE_ONE is defined elsewhere; presumably it
 * checks for 16-byte contiguous strides and no overlap within 64 bytes.
 *
 * Returns 1 if the kernel ran, 0 otherwise (always 0 without
 * AVX512F_NOMSVC).
 */
static NPY_INLINE int
run_binary_avx512f_add_CDOUBLE(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    int handled = 0;
#if defined AVX512F_NOMSVC
    if (IS_BINARY_STRIDE_ONE(16, 64)) {
        AVX512F_add_CDOUBLE(args, dimensions, steps);
        handled = 1;
    }
#endif
    return handled;
}

#line 703
/*
 * Runs AVX512F_subtract_CDOUBLE when AVX512F_NOMSVC is defined and
 * IS_BINARY_STRIDE_ONE(16, 64) holds.
 * NOTE(review): IS_BINARY_STRIDE_ONE is defined elsewhere; presumably it
 * checks for 16-byte contiguous strides and no overlap within 64 bytes.
 *
 * Returns 1 if the kernel ran, 0 otherwise (always 0 without
 * AVX512F_NOMSVC).
 */
static NPY_INLINE int
run_binary_avx512f_subtract_CDOUBLE(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    int handled = 0;
#if defined AVX512F_NOMSVC
    if (IS_BINARY_STRIDE_ONE(16, 64)) {
        AVX512F_subtract_CDOUBLE(args, dimensions, steps);
        handled = 1;
    }
#endif
    return handled;
}

#line 703
/*
 * Runs AVX512F_multiply_CDOUBLE when AVX512F_NOMSVC is defined and
 * IS_BINARY_STRIDE_ONE(16, 64) holds.
 * NOTE(review): IS_BINARY_STRIDE_ONE is defined elsewhere; presumably it
 * checks for 16-byte contiguous strides and no overlap within 64 bytes.
 *
 * Returns 1 if the kernel ran, 0 otherwise (always 0 without
 * AVX512F_NOMSVC).
 */
static NPY_INLINE int
run_binary_avx512f_multiply_CDOUBLE(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    int handled = 0;
#if defined AVX512F_NOMSVC
    if (IS_BINARY_STRIDE_ONE(16, 64)) {
        AVX512F_multiply_CDOUBLE(args, dimensions, steps);
        handled = 1;
    }
#endif
    return handled;
}



/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
#line 729
#line 735
/*
 * Inner loop for complex-float addition.
 *
 * args/dimensions/steps are consumed by the IS_BINARY_REDUCE and
 * BINARY_LOOP macros (fast_loop_macros.h), which also supply the
 * ip1/ip2/op1 names used below. The last parameter is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_add)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    // Parentheses around 1 tell clang the constant condition is intentional
    if (IS_BINARY_REDUCE && (1)) {
        /* Reduction: add the pairwise sum to the (real, imag) accumulator. */
        npy_float *acc_re = (npy_float *)args[0];
        npy_float *acc_im = ((npy_float *)args[0]) + 1;
        const npy_intp count = dimensions[0];
        npy_float sum_re, sum_im;

        CFLOAT_pairwise_sum(&sum_re, &sum_im, args[1], count * 2, steps[1] / 2);
        *acc_re += sum_re;
        *acc_im += sum_im;
        return;
    }
    if (run_binary_avx512f_add_CFLOAT(args, dimensions, steps)) {
        /* The AVX512F kernel handled the whole call. */
        return;
    }
    /* Scalar fallback: read both operands fully before writing the result. */
    BINARY_LOOP {
        const npy_float a_re = ((npy_float *)ip1)[0];
        const npy_float a_im = ((npy_float *)ip1)[1];
        const npy_float b_re = ((npy_float *)ip2)[0];
        const npy_float b_im = ((npy_float *)ip2)[1];
        ((npy_float *)op1)[0] = a_re + b_re;
        ((npy_float *)op1)[1] = a_im + b_im;
    }
}

#line 735
/*
 * Inner loop for complex-float subtraction.
 *
 * args/dimensions/steps are consumed by the IS_BINARY_REDUCE and
 * BINARY_LOOP macros (fast_loop_macros.h), which also supply the
 * ip1/ip2/op1 names used below. The last parameter is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_subtract)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    // Parentheses around 0 tell clang the dead code is intentional
    if (IS_BINARY_REDUCE && (0)) {
        /* Never taken: the condition is constant-false for subtraction. */
        npy_float *acc_re = (npy_float *)args[0];
        npy_float *acc_im = ((npy_float *)args[0]) + 1;
        const npy_intp count = dimensions[0];
        npy_float sum_re, sum_im;

        CFLOAT_pairwise_sum(&sum_re, &sum_im, args[1], count * 2, steps[1] / 2);
        *acc_re -= sum_re;
        *acc_im -= sum_im;
        return;
    }
    if (run_binary_avx512f_subtract_CFLOAT(args, dimensions, steps)) {
        /* The AVX512F kernel handled the whole call. */
        return;
    }
    /* Scalar fallback: read both operands fully before writing the result. */
    BINARY_LOOP {
        const npy_float a_re = ((npy_float *)ip1)[0];
        const npy_float a_im = ((npy_float *)ip1)[1];
        const npy_float b_re = ((npy_float *)ip2)[0];
        const npy_float b_im = ((npy_float *)ip2)[1];
        ((npy_float *)op1)[0] = a_re - b_re;
        ((npy_float *)op1)[1] = a_im - b_im;
    }
}


/*
 * Inner loop for complex-float multiplication.
 *
 * args/dimensions/steps are consumed by the BINARY_LOOP macro
 * (fast_loop_macros.h), which also supplies the ip1/ip2/op1 names used
 * below. The last parameter is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_multiply)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (run_binary_avx512f_multiply_CFLOAT(args, dimensions, steps)) {
        /* The AVX512F kernel handled the whole call. */
        return;
    }
    /* Scalar fallback: read both operands fully before writing the result. */
    BINARY_LOOP {
        const npy_float a_re = ((npy_float *)ip1)[0];
        const npy_float a_im = ((npy_float *)ip1)[1];
        const npy_float b_re = ((npy_float *)ip2)[0];
        const npy_float b_im = ((npy_float *)ip2)[1];
        ((npy_float *)op1)[0] = a_re*b_re - a_im*b_im;
        ((npy_float *)op1)[1] = a_re*b_im + a_im*b_re;
    }
}

#line 729
#line 735
/*
 * Inner loop for complex-double addition.
 *
 * args/dimensions/steps are consumed by the IS_BINARY_REDUCE and
 * BINARY_LOOP macros (fast_loop_macros.h), which also supply the
 * ip1/ip2/op1 names used below. The last parameter is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_add)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    // Parentheses around 1 tell clang the constant condition is intentional
    if (IS_BINARY_REDUCE && (1)) {
        /* Reduction: add the pairwise sum to the (real, imag) accumulator. */
        npy_double *acc_re = (npy_double *)args[0];
        npy_double *acc_im = ((npy_double *)args[0]) + 1;
        const npy_intp count = dimensions[0];
        npy_double sum_re, sum_im;

        CDOUBLE_pairwise_sum(&sum_re, &sum_im, args[1], count * 2, steps[1] / 2);
        *acc_re += sum_re;
        *acc_im += sum_im;
        return;
    }
    if (run_binary_avx512f_add_CDOUBLE(args, dimensions, steps)) {
        /* The AVX512F kernel handled the whole call. */
        return;
    }
    /* Scalar fallback: read both operands fully before writing the result. */
    BINARY_LOOP {
        const npy_double a_re = ((npy_double *)ip1)[0];
        const npy_double a_im = ((npy_double *)ip1)[1];
        const npy_double b_re = ((npy_double *)ip2)[0];
        const npy_double b_im = ((npy_double *)ip2)[1];
        ((npy_double *)op1)[0] = a_re + b_re;
        ((npy_double *)op1)[1] = a_im + b_im;
    }
}

#line 735
/*
 * Inner loop for complex-double subtraction.
 *
 * args/dimensions/steps are consumed by the IS_BINARY_REDUCE and
 * BINARY_LOOP macros (fast_loop_macros.h), which also supply the
 * ip1/ip2/op1 names used below. The last parameter is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_subtract)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    // Parentheses around 0 tell clang the dead code is intentional
    if (IS_BINARY_REDUCE && (0)) {
        /* Never taken: the condition is constant-false for subtraction. */
        npy_double *acc_re = (npy_double *)args[0];
        npy_double *acc_im = ((npy_double *)args[0]) + 1;
        const npy_intp count = dimensions[0];
        npy_double sum_re, sum_im;

        CDOUBLE_pairwise_sum(&sum_re, &sum_im, args[1], count * 2, steps[1] / 2);
        *acc_re -= sum_re;
        *acc_im -= sum_im;
        return;
    }
    if (run_binary_avx512f_subtract_CDOUBLE(args, dimensions, steps)) {
        /* The AVX512F kernel handled the whole call. */
        return;
    }
    /* Scalar fallback: read both operands fully before writing the result. */
    BINARY_LOOP {
        const npy_double a_re = ((npy_double *)ip1)[0];
        const npy_double a_im = ((npy_double *)ip1)[1];
        const npy_double b_re = ((npy_double *)ip2)[0];
        const npy_double b_im = ((npy_double *)ip2)[1];
        ((npy_double *)op1)[0] = a_re - b_re;
        ((npy_double *)op1)[1] = a_im - b_im;
    }
}


/*
 * Inner loop for complex-double multiplication.
 *
 * args/dimensions/steps are consumed by the BINARY_LOOP macro
 * (fast_loop_macros.h), which also supplies the ip1/ip2/op1 names used
 * below. The last parameter is unused.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_multiply)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (run_binary_avx512f_multiply_CDOUBLE(args, dimensions, steps)) {
        /* The AVX512F kernel handled the whole call. */
        return;
    }
    /* Scalar fallback: read both operands fully before writing the result. */
    BINARY_LOOP {
        const npy_double a_re = ((npy_double *)ip1)[0];
        const npy_double a_im = ((npy_double *)ip1)[1];
        const npy_double b_re = ((npy_double *)ip2)[0];
        const npy_double b_im = ((npy_double *)ip2)[1];
        ((npy_double *)op1)[0] = a_re*b_re - a_im*b_im;
        ((npy_double *)op1)[1] = a_re*b_im + a_im*b_re;
    }
}


