#line 1 "numpy/core/src/umath/loops_modulo.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*@targets
 ** baseline vsx4
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"


/*
 * Evaluates to non-zero when the integer division x / y must not be carried
 * out: y is 0, or, when `signed` is non-zero, x equals min_val and y is -1.
 * (Presumably min_val is the smallest value of the signed type, so that the
 * quotient would overflow -- confirm against the callers.)
 *
 * Every argument is parenthesized in the expansion so that expressions such
 * as `a & b` can be passed safely. Note that x and y may still be evaluated
 * more than once, so arguments must not have side effects.
 */
#define DIVIDEBYZERO_OVERFLOW_CHECK(x, y, min_val, signed)      \
    (NPY_UNLIKELY(                                              \
        (signed)                                              ? \
        (((y) == 0) || (((x) == (min_val)) && ((y) == -1)))   : \
        ((y) == 0))                                             \
    )

/*
 * Raises the divide-by-zero floating-point status flag when x is 0.
 * The argument is parenthesized so that compound expressions (for example
 * `a & b`) are compared against 0 as a whole.
 */
#define FLAG_IF_DIVIDEBYZERO(x) do {     \
    if (NPY_UNLIKELY((x) == 0)) {        \
        npy_set_floatstatus_divbyzero(); \
    }                                    \
} while (0)


#if NPY_SIMD && defined(NPY_HAVE_VSX4)
// Four vectors of unsigned 32-bit lanes, stored as two pairs. Filled by
// vsx4_expand_u32_u8() from a single vector of unsigned 8-bit lanes.
typedef struct {
    npyv_u32x2 hi;  // 32-bit expansion of the first 16-bit half (val[0])
    npyv_u32x2 lo;  // 32-bit expansion of the second 16-bit half (val[1])
} vsx4_u32x4;

// Four vectors of signed 32-bit lanes, stored as two pairs. Filled by
// vsx4_expand_s32_s8() from a single vector of signed 8-bit lanes.
typedef struct {
    npyv_s32x2 hi;  // 32-bit expansion of the first 16-bit half (val[0])
    npyv_s32x2 lo;  // 32-bit expansion of the second 16-bit half (val[1])
} vsx4_s32x4;

// Widens one vector of signed 8-bit lanes into a pair of signed 16-bit
// vectors: val[0] comes from vec_unpackh, val[1] from vec_unpackl.
NPY_FINLINE npyv_s16x2
vsx4_expand_s16_s8(npyv_s8 data)
{
    npyv_s16x2 widened;
    widened.val[1] = vec_unpackl(data);
    widened.val[0] = vec_unpackh(data);
    return widened;
}

// Widens one vector of signed 16-bit lanes into a pair of signed 32-bit
// vectors: val[0] comes from vec_unpackh, val[1] from vec_unpackl.
NPY_FINLINE npyv_s32x2
vsx4_expand_s32_s16(npyv_s16 data)
{
    npyv_s32x2 widened;
    widened.val[1] = vec_unpackl(data);
    widened.val[0] = vec_unpackh(data);
    return widened;
}

#line 65
// Widens one vector of unsigned 8-bit lanes into four vectors of unsigned
// 32-bit lanes, going through an intermediate pair of 16-bit vectors.
NPY_FINLINE vsx4_u32x4
vsx4_expand_u32_u8(npyv_u8 data)
{
    // First step: 8-bit -> 16-bit.
    npyv_u16x2 halves = npyv_expand_u16_u8(data);

    // Second step: each 16-bit vector -> two 32-bit vectors.
    vsx4_u32x4 quads;
    quads.hi = npyv_expand_u32_u16(halves.val[0]);
    quads.lo = npyv_expand_u32_u16(halves.val[1]);
    return quads;
}

#line 79
/*
 * Lane-wise quotient a / b of two unsigned 8-bit vectors.
 *
 * As Power10 only supports integer vector division/modulo for data of 32
 * bits or greater, both operands are widened into 4x npyv_u32, divided with
 * vec_div, and the four partial results are packed back into one npyv_u8
 * with two levels of vec_pack.
 */
NPY_FINLINE npyv_u8
vsx4_div_u8(npyv_u8 a, npyv_u8 b)
{
    vsx4_u32x4 num = vsx4_expand_u32_u8(a);
    vsx4_u32x4 den = vsx4_expand_u32_u8(b);
    npyv_u16 upper = vec_pack(
        vec_div(num.hi.val[0], den.hi.val[0]),
        vec_div(num.hi.val[1], den.hi.val[1]));
    npyv_u16 lower = vec_pack(
        vec_div(num.lo.val[0], den.lo.val[0]),
        vec_div(num.lo.val[1], den.lo.val[1]));
    return vec_pack(upper, lower);
}

/*
 * Lane-wise quotient of an unsigned 8-bit vector by a divisor that has
 * already been widened to 4x npyv_u32 (see vsx4_divisor_u8), so only the
 * dividend has to be expanded here.
 */
NPY_FINLINE npyv_u8
vsx4_div_scalar_u8(npyv_u8 a, const vsx4_u32x4 b_expand)
{
    vsx4_u32x4 num = vsx4_expand_u32_u8(a);
    npyv_u16 upper = vec_pack(
        vec_div(num.hi.val[0], b_expand.hi.val[0]),
        vec_div(num.hi.val[1], b_expand.hi.val[1]));
    npyv_u16 lower = vec_pack(
        vec_div(num.lo.val[0], b_expand.lo.val[0]),
        vec_div(num.lo.val[1], b_expand.lo.val[1]));
    return vec_pack(upper, lower);
}

/*
 * Lane-wise quotient a / b of two unsigned 16-bit vectors: both operands are
 * widened to 2x npyv_u32, divided with vec_div and packed back to 16 bits.
 */
NPY_FINLINE npyv_u16
vsx4_div_u16(npyv_u16 a, npyv_u16 b)
{
    npyv_u32x2 num = npyv_expand_u32_u16(a);
    npyv_u32x2 den = npyv_expand_u32_u16(b);
    return vec_pack(vec_div(num.val[0], den.val[0]),
                    vec_div(num.val[1], den.val[1]));
}

/*
 * Lane-wise quotient of an unsigned 16-bit vector by a divisor that has
 * already been widened to 2x npyv_u32 (see vsx4_divisor_u16).
 */
NPY_FINLINE npyv_u16
vsx4_div_scalar_u16(npyv_u16 a, const npyv_u32x2 b_expand)
{
    npyv_u32x2 num = npyv_expand_u32_u16(a);
    return vec_pack(vec_div(num.val[0], b_expand.val[0]),
                    vec_div(num.val[1], b_expand.val[1]));
}

// 32-bit and 64-bit unsigned lanes need no widening: both the vector/vector
// and the vector/pre-expanded-divisor forms map directly onto vec_div.
#define vsx4_div_u32 vec_div
#define vsx4_div_u64 vec_div
#define vsx4_div_scalar_u32 vec_div
#define vsx4_div_scalar_u64 vec_div

#line 79
/*
 * Lane-wise remainder a % b of two unsigned 8-bit vectors.
 *
 * As Power10 only supports integer vector division/modulo for data of 32
 * bits or greater, both operands are widened into 4x npyv_u32, reduced with
 * vec_mod, and the four partial results are packed back into one npyv_u8
 * with two levels of vec_pack.
 */
NPY_FINLINE npyv_u8
vsx4_mod_u8(npyv_u8 a, npyv_u8 b)
{
    vsx4_u32x4 num = vsx4_expand_u32_u8(a);
    vsx4_u32x4 den = vsx4_expand_u32_u8(b);
    npyv_u16 upper = vec_pack(
        vec_mod(num.hi.val[0], den.hi.val[0]),
        vec_mod(num.hi.val[1], den.hi.val[1]));
    npyv_u16 lower = vec_pack(
        vec_mod(num.lo.val[0], den.lo.val[0]),
        vec_mod(num.lo.val[1], den.lo.val[1]));
    return vec_pack(upper, lower);
}

/*
 * Lane-wise remainder of an unsigned 8-bit vector by a divisor that has
 * already been widened to 4x npyv_u32 (see vsx4_divisor_u8), so only the
 * dividend has to be expanded here.
 */
NPY_FINLINE npyv_u8
vsx4_mod_scalar_u8(npyv_u8 a, const vsx4_u32x4 b_expand)
{
    vsx4_u32x4 num = vsx4_expand_u32_u8(a);
    npyv_u16 upper = vec_pack(
        vec_mod(num.hi.val[0], b_expand.hi.val[0]),
        vec_mod(num.hi.val[1], b_expand.hi.val[1]));
    npyv_u16 lower = vec_pack(
        vec_mod(num.lo.val[0], b_expand.lo.val[0]),
        vec_mod(num.lo.val[1], b_expand.lo.val[1]));
    return vec_pack(upper, lower);
}

/*
 * Lane-wise remainder a % b of two unsigned 16-bit vectors: both operands
 * are widened to 2x npyv_u32, reduced with vec_mod and packed back to 16 bits.
 */
NPY_FINLINE npyv_u16
vsx4_mod_u16(npyv_u16 a, npyv_u16 b)
{
    npyv_u32x2 num = npyv_expand_u32_u16(a);
    npyv_u32x2 den = npyv_expand_u32_u16(b);
    return vec_pack(vec_mod(num.val[0], den.val[0]),
                    vec_mod(num.val[1], den.val[1]));
}

/*
 * Lane-wise remainder of an unsigned 16-bit vector by a divisor that has
 * already been widened to 2x npyv_u32 (see vsx4_divisor_u16).
 */
NPY_FINLINE npyv_u16
vsx4_mod_scalar_u16(npyv_u16 a, const npyv_u32x2 b_expand)
{
    npyv_u32x2 num = npyv_expand_u32_u16(a);
    return vec_pack(vec_mod(num.val[0], b_expand.val[0]),
                    vec_mod(num.val[1], b_expand.val[1]));
}

// 32-bit and 64-bit unsigned lanes need no widening: both the vector/vector
// and the vector/pre-expanded-divisor forms map directly onto vec_mod.
#define vsx4_mod_u32 vec_mod
#define vsx4_mod_u64 vec_mod
#define vsx4_mod_scalar_u32 vec_mod
#define vsx4_mod_scalar_u64 vec_mod


#line 65
// Widens one vector of signed 8-bit lanes into four vectors of signed 32-bit
// lanes, going through an intermediate pair of 16-bit vectors.
NPY_FINLINE vsx4_s32x4
vsx4_expand_s32_s8(npyv_s8 data)
{
    // First step: 8-bit -> 16-bit.
    npyv_s16x2 halves = vsx4_expand_s16_s8(data);

    // Second step: each 16-bit vector -> two 32-bit vectors.
    vsx4_s32x4 quads;
    quads.hi = vsx4_expand_s32_s16(halves.val[0]);
    quads.lo = vsx4_expand_s32_s16(halves.val[1]);
    return quads;
}

#line 79
/*
 * Lane-wise quotient a / b of two signed 8-bit vectors.
 *
 * As Power10 only supports integer vector division/modulo for data of 32
 * bits or greater, both operands are widened into 4x npyv_s32, divided with
 * vec_div, and the four partial results are packed back into one npyv_s8
 * with two levels of vec_pack.
 */
NPY_FINLINE npyv_s8
vsx4_div_s8(npyv_s8 a, npyv_s8 b)
{
    vsx4_s32x4 num = vsx4_expand_s32_s8(a);
    vsx4_s32x4 den = vsx4_expand_s32_s8(b);
    npyv_s16 upper = vec_pack(
        vec_div(num.hi.val[0], den.hi.val[0]),
        vec_div(num.hi.val[1], den.hi.val[1]));
    npyv_s16 lower = vec_pack(
        vec_div(num.lo.val[0], den.lo.val[0]),
        vec_div(num.lo.val[1], den.lo.val[1]));
    return vec_pack(upper, lower);
}

/*
 * Lane-wise quotient of a signed 8-bit vector by a divisor that has already
 * been widened to 4x npyv_s32 (see vsx4_divisor_s8), so only the dividend
 * has to be expanded here.
 */
NPY_FINLINE npyv_s8
vsx4_div_scalar_s8(npyv_s8 a, const vsx4_s32x4 b_expand)
{
    vsx4_s32x4 num = vsx4_expand_s32_s8(a);
    npyv_s16 upper = vec_pack(
        vec_div(num.hi.val[0], b_expand.hi.val[0]),
        vec_div(num.hi.val[1], b_expand.hi.val[1]));
    npyv_s16 lower = vec_pack(
        vec_div(num.lo.val[0], b_expand.lo.val[0]),
        vec_div(num.lo.val[1], b_expand.lo.val[1]));
    return vec_pack(upper, lower);
}

/*
 * Lane-wise quotient a / b of two signed 16-bit vectors: both operands are
 * widened to 2x npyv_s32, divided with vec_div and packed back to 16 bits.
 */
NPY_FINLINE npyv_s16
vsx4_div_s16(npyv_s16 a, npyv_s16 b)
{
    npyv_s32x2 num = vsx4_expand_s32_s16(a);
    npyv_s32x2 den = vsx4_expand_s32_s16(b);
    return vec_pack(vec_div(num.val[0], den.val[0]),
                    vec_div(num.val[1], den.val[1]));
}

/*
 * Lane-wise quotient of a signed 16-bit vector by a divisor that has already
 * been widened to 2x npyv_s32 (see vsx4_divisor_s16).
 */
NPY_FINLINE npyv_s16
vsx4_div_scalar_s16(npyv_s16 a, const npyv_s32x2 b_expand)
{
    npyv_s32x2 num = vsx4_expand_s32_s16(a);
    return vec_pack(vec_div(num.val[0], b_expand.val[0]),
                    vec_div(num.val[1], b_expand.val[1]));
}

// 32-bit and 64-bit signed lanes need no widening: both the vector/vector
// and the vector/pre-expanded-divisor forms map directly onto vec_div.
#define vsx4_div_s32 vec_div
#define vsx4_div_s64 vec_div
#define vsx4_div_scalar_s32 vec_div
#define vsx4_div_scalar_s64 vec_div

#line 79
/*
 * Lane-wise remainder a % b of two signed 8-bit vectors.
 *
 * As Power10 only supports integer vector division/modulo for data of 32
 * bits or greater, both operands are widened into 4x npyv_s32, reduced with
 * vec_mod, and the four partial results are packed back into one npyv_s8
 * with two levels of vec_pack.
 */
NPY_FINLINE npyv_s8
vsx4_mod_s8(npyv_s8 a, npyv_s8 b)
{
    vsx4_s32x4 num = vsx4_expand_s32_s8(a);
    vsx4_s32x4 den = vsx4_expand_s32_s8(b);
    npyv_s16 upper = vec_pack(
        vec_mod(num.hi.val[0], den.hi.val[0]),
        vec_mod(num.hi.val[1], den.hi.val[1]));
    npyv_s16 lower = vec_pack(
        vec_mod(num.lo.val[0], den.lo.val[0]),
        vec_mod(num.lo.val[1], den.lo.val[1]));
    return vec_pack(upper, lower);
}

/*
 * Lane-wise remainder of a signed 8-bit vector by a divisor that has already
 * been widened to 4x npyv_s32 (see vsx4_divisor_s8), so only the dividend
 * has to be expanded here.
 */
NPY_FINLINE npyv_s8
vsx4_mod_scalar_s8(npyv_s8 a, const vsx4_s32x4 b_expand)
{
    vsx4_s32x4 num = vsx4_expand_s32_s8(a);
    npyv_s16 upper = vec_pack(
        vec_mod(num.hi.val[0], b_expand.hi.val[0]),
        vec_mod(num.hi.val[1], b_expand.hi.val[1]));
    npyv_s16 lower = vec_pack(
        vec_mod(num.lo.val[0], b_expand.lo.val[0]),
        vec_mod(num.lo.val[1], b_expand.lo.val[1]));
    return vec_pack(upper, lower);
}

/*
 * Lane-wise remainder a % b of two signed 16-bit vectors: both operands are
 * widened to 2x npyv_s32, reduced with vec_mod and packed back to 16 bits.
 */
NPY_FINLINE npyv_s16
vsx4_mod_s16(npyv_s16 a, npyv_s16 b)
{
    npyv_s32x2 num = vsx4_expand_s32_s16(a);
    npyv_s32x2 den = vsx4_expand_s32_s16(b);
    return vec_pack(vec_mod(num.val[0], den.val[0]),
                    vec_mod(num.val[1], den.val[1]));
}

/*
 * Lane-wise remainder of a signed 16-bit vector by a divisor that has
 * already been widened to 2x npyv_s32 (see vsx4_divisor_s16).
 */
NPY_FINLINE npyv_s16
vsx4_mod_scalar_s16(npyv_s16 a, const npyv_s32x2 b_expand)
{
    npyv_s32x2 num = vsx4_expand_s32_s16(a);
    return vec_pack(vec_mod(num.val[0], b_expand.val[0]),
                    vec_mod(num.val[1], b_expand.val[1]));
}

// 32-bit and 64-bit signed lanes need no widening: both the vector/vector
// and the vector/pre-expanded-divisor forms map directly onto vec_mod.
#define vsx4_mod_s32 vec_mod
#define vsx4_mod_s64 vec_mod
#define vsx4_mod_scalar_s32 vec_mod
#define vsx4_mod_scalar_s64 vec_mod



#line 146
// Prepares the divisor operand of the vsx4_*_scalar_u8 helpers by widening
// the broadcast 8-bit divisor vector to 4x npyv_u32 once, up front.
NPY_FINLINE vsx4_u32x4
vsx4_divisor_u8(const npyv_u8 vscalar)
{
    const vsx4_u32x4 widened = vsx4_expand_u32_u8(vscalar);
    return widened;
}

#line 146
// Prepares the divisor operand of the vsx4_*_scalar_u16 helpers by widening
// the broadcast 16-bit divisor vector to 2x npyv_u32 once, up front.
NPY_FINLINE npyv_u32x2
vsx4_divisor_u16(const npyv_u16 vscalar)
{
    const npyv_u32x2 widened = npyv_expand_u32_u16(vscalar);
    return widened;
}

#line 146
// Prepares the divisor operand of the vsx4_*_scalar_s8 helpers by widening
// the broadcast 8-bit divisor vector to 4x npyv_s32 once, up front.
NPY_FINLINE vsx4_s32x4
vsx4_divisor_s8(const npyv_s8 vscalar)
{
    const vsx4_s32x4 widened = vsx4_expand_s32_s8(vscalar);
    return widened;
}

#line 146
// Prepares the divisor operand of the vsx4_*_scalar_s16 helpers by widening
// the broadcast 16-bit divisor vector to 2x npyv_s32 once, up front.
NPY_FINLINE npyv_s32x2
vsx4_divisor_s16(const npyv_s16 vscalar)
{
    const npyv_s32x2 widened = vsx4_expand_s32_s16(vscalar);
    return widened;
}


#line 157
// Divisor operand for the 32-bit unsigned helpers: no widening is required,
// so the broadcast divisor vector is handed back unchanged.
NPY_FINLINE npyv_u32
vsx4_divisor_u32(const npyv_u32 vscalar)
{
    const npyv_u32 divisor = vscalar;
    return divisor;
}

#line 157
// Divisor operand for the 64-bit unsigned helpers: no widening is required,
// so the broadcast divisor vector is handed back unchanged.
NPY_FINLINE npyv_u64
vsx4_divisor_u64(const npyv_u64 vscalar)
{
    const npyv_u64 divisor = vscalar;
    return divisor;
}

#line 157
// Divisor operand for the 32-bit signed helpers: no widening is required,
// so the broadcast divisor vector is handed back unchanged.
NPY_FINLINE npyv_s32
vsx4_divisor_s32(const npyv_s32 vscalar)
{
    const npyv_s32 divisor = vscalar;
    return divisor;
}

#line 157
// Divisor operand for the 64-bit signed helpers: no widening is required,
// so the broadcast divisor vector is handed back unchanged.
NPY_FINLINE npyv_s64
vsx4_divisor_s64(const npyv_s64 vscalar)
{
    const npyv_s64 divisor = vscalar;
    return divisor;
}


#line 170
#line 174
/*
 * fmod of two contiguous unsigned 8-bit arrays: dst[i] = src1[i] % src2[i].
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * is written as the output array, and len is the number of elements.
 * Whole vectors of npyv_nlanes_u8 elements go through vsx4_mod_u8(); the
 * remaining elements are processed one at a time.
 *
 * A zero divisor raises the divide-by-zero floating-point status flag and
 * yields 0 in the output, in the vector body as well as in the scalar tail.
 */
static NPY_INLINE void
vsx4_simd_fmod_contig_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero    = npyv_zero_u8();
    const int vstep        = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_u8 a = npyv_load_u8(src1);
        npyv_u8 b = npyv_load_u8(src2);
        npyv_u8 c = vsx4_mod_u8(a, b);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            // Do not rely on whatever vec_mod leaves in lanes whose divisor
            // is 0: store 0 there, exactly like the scalar tail below.
            c = npyv_select_u8(npyv_cmpeq_u8(b, vzero), vzero, c);
            npy_set_floatstatus_divbyzero();
        }
        npyv_store_u8(dst1, c);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_u8 a = *src1;
        const npyv_lanetype_u8 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else{
            *dst1 = a % b;
        }
    }
    npyv_cleanup();
}

/*
 * fmod of a contiguous unsigned 8-bit array by a single divisor:
 * dst[i] = src[i] % scalar.
 *
 * args[0] is read as the dividend array, the divisor is the single element
 * read from args[1], args[2] is written as the output array, and len is the
 * number of elements. The divisor is broadcast and widened once, outside the
 * loop, via vsx4_divisor_u8().
 *
 * A zero divisor is handled explicitly (flag raised, output filled with 0,
 * the same result the array-divisor kernel stores) instead of reaching the
 * `%` in the scalar tail.
 * NOTE(review): callers presumably already filter out a zero divisor before
 * dispatching here -- confirm; the guard is kept as a safety net.
 */
static NPY_INLINE void
vsx4_simd_fmod_by_scalar_contig_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src1  = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst1  = (npyv_lanetype_u8 *) args[2];
    const int vstep         = npyv_nlanes_u8;

    if (NPY_UNLIKELY(scalar == 0)) {
        if (len > 0) {
            npy_set_floatstatus_divbyzero();
        }
        for (; len > 0; --len, ++dst1) {
            *dst1 = 0;
        }
        return;
    }

    const npyv_u8 vscalar    = npyv_setall_u8(scalar);
    const vsx4_u32x4 divisor = vsx4_divisor_u8(vscalar);

    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
        npyv_u8 a = npyv_load_u8(src1);
        npyv_u8 c = vsx4_mod_scalar_u8(a, divisor);
        npyv_store_u8(dst1, c);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_u8 a = *src1;
        *dst1 = a % scalar;
    }
    npyv_cleanup();
}

#line 174
/*
 * remainder of two contiguous unsigned 8-bit arrays:
 * dst[i] = src1[i] % src2[i].
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * is written as the output array, and len is the number of elements.
 * Whole vectors of npyv_nlanes_u8 elements go through vsx4_mod_u8(); the
 * remaining elements are processed one at a time.
 *
 * A zero divisor raises the divide-by-zero floating-point status flag and
 * yields 0 in the output, in the vector body as well as in the scalar tail.
 */
static NPY_INLINE void
vsx4_simd_remainder_contig_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2];
    const npyv_u8 vzero    = npyv_zero_u8();
    const int vstep        = npyv_nlanes_u8;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_u8 a = npyv_load_u8(src1);
        npyv_u8 b = npyv_load_u8(src2);
        npyv_u8 c = vsx4_mod_u8(a, b);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            // Do not rely on whatever vec_mod leaves in lanes whose divisor
            // is 0: store 0 there, exactly like the scalar tail below.
            c = npyv_select_u8(npyv_cmpeq_u8(b, vzero), vzero, c);
            npy_set_floatstatus_divbyzero();
        }
        npyv_store_u8(dst1, c);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_u8 a = *src1;
        const npyv_lanetype_u8 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else{
            *dst1 = a % b;
        }
    }
    npyv_cleanup();
}

/*
 * remainder of a contiguous unsigned 8-bit array by a single divisor:
 * dst[i] = src[i] % scalar.
 *
 * args[0] is read as the dividend array, the divisor is the single element
 * read from args[1], args[2] is written as the output array, and len is the
 * number of elements. The divisor is broadcast and widened once, outside the
 * loop, via vsx4_divisor_u8().
 *
 * A zero divisor is handled explicitly (flag raised, output filled with 0,
 * the same result the array-divisor kernel stores) instead of reaching the
 * `%` in the scalar tail.
 * NOTE(review): callers presumably already filter out a zero divisor before
 * dispatching here -- confirm; the guard is kept as a safety net.
 */
static NPY_INLINE void
vsx4_simd_remainder_by_scalar_contig_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src1  = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst1  = (npyv_lanetype_u8 *) args[2];
    const int vstep         = npyv_nlanes_u8;

    if (NPY_UNLIKELY(scalar == 0)) {
        if (len > 0) {
            npy_set_floatstatus_divbyzero();
        }
        for (; len > 0; --len, ++dst1) {
            *dst1 = 0;
        }
        return;
    }

    const npyv_u8 vscalar    = npyv_setall_u8(scalar);
    const vsx4_u32x4 divisor = vsx4_divisor_u8(vscalar);

    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
        npyv_u8 a = npyv_load_u8(src1);
        npyv_u8 c = vsx4_mod_scalar_u8(a, divisor);
        npyv_store_u8(dst1, c);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_u8 a = *src1;
        *dst1 = a % scalar;
    }
    npyv_cleanup();
}

#line 174
/*
 * divmod of two contiguous unsigned 8-bit arrays:
 * dst1[i] = src1[i] / src2[i] and dst2[i] = src1[i] % src2[i].
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * and args[3] are written as the quotient and remainder arrays, and len is
 * the number of elements. In the vector body the remainder is derived from
 * the quotient as a - b * quo, so only one vector division is needed.
 *
 * A zero divisor raises the divide-by-zero floating-point status flag and
 * yields 0 for both outputs, in the vector body as well as in the scalar
 * tail.
 */
static NPY_INLINE void
vsx4_simd_divmod_contig_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2];
    npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3];
    const npyv_u8 vzero    = npyv_zero_u8();
    const int vstep        = npyv_nlanes_u8;
    // Accumulates, across all iterations, the lanes that saw a zero divisor.
    npyv_b8 warn           = npyv_cvt_b8_u8(npyv_zero_u8());

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep, dst2 += vstep) {
        npyv_u8 a      = npyv_load_u8(src1);
        npyv_u8 b      = npyv_load_u8(src2);
        npyv_u8 quo    = vsx4_div_u8(a, b);
        npyv_u8 rem    = npyv_sub_u8(a, vec_mul(b, quo));
        npyv_b8 bzero  = npyv_cmpeq_u8(b, vzero);
        // When b is 0, force both results to 0 (as the scalar tail does)
        // rather than relying on what vec_div leaves in those lanes.
        quo  = npyv_select_u8(bzero, vzero, quo);
        rem  = npyv_select_u8(bzero, vzero, rem);
        warn = npyv_or_u8(bzero, warn);
        npyv_store_u8(dst1, quo);
        npyv_store_u8(dst2, rem);
    }

    // Raise the flag once for the whole vector body.
    if (!vec_all_eq(warn, vzero)) {
        npy_set_floatstatus_divbyzero();
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
        const npyv_lanetype_u8 a = *src1;
        const npyv_lanetype_u8 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
            *dst2 = 0;
        } else{
            *dst1 = a / b;
            *dst2 = a % b;
        }
    }
    npyv_cleanup();
}

/*
 * divmod of a contiguous unsigned 8-bit array by a single divisor:
 * dst1[i] = src[i] / scalar and dst2[i] = src[i] % scalar.
 *
 * args[0] is read as the dividend array, the divisor is the single element
 * read from args[1], args[2] and args[3] are written as the quotient and
 * remainder arrays, and len is the number of elements. In the vector body
 * the remainder is derived from the quotient as a - scalar * quo.
 *
 * A zero divisor is handled explicitly (flag raised, both outputs filled
 * with 0, the same result the array-divisor kernel stores) instead of
 * reaching the `/` and `%` in the scalar tail.
 * NOTE(review): callers presumably already filter out a zero divisor before
 * dispatching here -- confirm; the guard is kept as a safety net.
 */
static NPY_INLINE void
vsx4_simd_divmod_by_scalar_contig_u8(char **args, npy_intp len)
{
    npyv_lanetype_u8 *src1  = (npyv_lanetype_u8 *) args[0];
    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
    npyv_lanetype_u8 *dst1  = (npyv_lanetype_u8 *) args[2];
    npyv_lanetype_u8 *dst2  = (npyv_lanetype_u8 *) args[3];
    const int vstep         = npyv_nlanes_u8;

    if (NPY_UNLIKELY(scalar == 0)) {
        if (len > 0) {
            npy_set_floatstatus_divbyzero();
        }
        for (; len > 0; --len, ++dst1, ++dst2) {
            *dst1 = 0;
            *dst2 = 0;
        }
        return;
    }

    const npyv_u8 vscalar    = npyv_setall_u8(scalar);
    const vsx4_u32x4 divisor = vsx4_divisor_u8(vscalar);

    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
         dst2 += vstep) {
        npyv_u8 a   = npyv_load_u8(src1);
        npyv_u8 quo = vsx4_div_scalar_u8(a, divisor);
        npyv_u8 rem = npyv_sub_u8(a, vec_mul(vscalar, quo));
        npyv_store_u8(dst1, quo);
        npyv_store_u8(dst2, rem);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
        const npyv_lanetype_u8 a = *src1;
        *dst1 = a / scalar;
        *dst2 = a % scalar;
    }
    npyv_cleanup();
}


#line 170
#line 174
/*
 * fmod of two contiguous unsigned 16-bit arrays: dst[i] = src1[i] % src2[i].
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * is written as the output array, and len is the number of elements.
 * Whole vectors of npyv_nlanes_u16 elements go through vsx4_mod_u16(); the
 * remaining elements are processed one at a time.
 *
 * A zero divisor raises the divide-by-zero floating-point status flag and
 * yields 0 in the output, in the vector body as well as in the scalar tail.
 */
static NPY_INLINE void
vsx4_simd_fmod_contig_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2];
    const npyv_u16 vzero    = npyv_zero_u16();
    const int vstep         = npyv_nlanes_u16;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_u16 a = npyv_load_u16(src1);
        npyv_u16 b = npyv_load_u16(src2);
        npyv_u16 c = vsx4_mod_u16(a, b);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            // Do not rely on whatever vec_mod leaves in lanes whose divisor
            // is 0: store 0 there, exactly like the scalar tail below.
            c = npyv_select_u16(npyv_cmpeq_u16(b, vzero), vzero, c);
            npy_set_floatstatus_divbyzero();
        }
        npyv_store_u16(dst1, c);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_u16 a = *src1;
        const npyv_lanetype_u16 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else{
            *dst1 = a % b;
        }
    }
    npyv_cleanup();
}

/*
 * fmod of a contiguous unsigned 16-bit array by a single divisor:
 * dst[i] = src[i] % scalar.
 *
 * args[0] is read as the dividend array, the divisor is the single element
 * read from args[1], args[2] is written as the output array, and len is the
 * number of elements. The divisor is broadcast and widened once, outside the
 * loop, via vsx4_divisor_u16().
 *
 * A zero divisor is handled explicitly (flag raised, output filled with 0,
 * the same result the array-divisor kernel stores) instead of reaching the
 * `%` in the scalar tail.
 * NOTE(review): callers presumably already filter out a zero divisor before
 * dispatching here -- confirm; the guard is kept as a safety net.
 */
static NPY_INLINE void
vsx4_simd_fmod_by_scalar_contig_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *src1  = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u16 *dst1  = (npyv_lanetype_u16 *) args[2];
    const int vstep          = npyv_nlanes_u16;

    if (NPY_UNLIKELY(scalar == 0)) {
        if (len > 0) {
            npy_set_floatstatus_divbyzero();
        }
        for (; len > 0; --len, ++dst1) {
            *dst1 = 0;
        }
        return;
    }

    const npyv_u16 vscalar   = npyv_setall_u16(scalar);
    const npyv_u32x2 divisor = vsx4_divisor_u16(vscalar);

    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
        npyv_u16 a = npyv_load_u16(src1);
        npyv_u16 c = vsx4_mod_scalar_u16(a, divisor);
        npyv_store_u16(dst1, c);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_u16 a = *src1;
        *dst1 = a % scalar;
    }
    npyv_cleanup();
}

#line 174
/*
 * remainder of two contiguous unsigned 16-bit arrays:
 * dst[i] = src1[i] % src2[i].
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * is written as the output array, and len is the number of elements.
 * Whole vectors of npyv_nlanes_u16 elements go through vsx4_mod_u16(); the
 * remaining elements are processed one at a time.
 *
 * A zero divisor raises the divide-by-zero floating-point status flag and
 * yields 0 in the output, in the vector body as well as in the scalar tail.
 */
static NPY_INLINE void
vsx4_simd_remainder_contig_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2];
    const npyv_u16 vzero    = npyv_zero_u16();
    const int vstep         = npyv_nlanes_u16;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_u16 a = npyv_load_u16(src1);
        npyv_u16 b = npyv_load_u16(src2);
        npyv_u16 c = vsx4_mod_u16(a, b);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            // Do not rely on whatever vec_mod leaves in lanes whose divisor
            // is 0: store 0 there, exactly like the scalar tail below.
            c = npyv_select_u16(npyv_cmpeq_u16(b, vzero), vzero, c);
            npy_set_floatstatus_divbyzero();
        }
        npyv_store_u16(dst1, c);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_u16 a = *src1;
        const npyv_lanetype_u16 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        } else{
            *dst1 = a % b;
        }
    }
    npyv_cleanup();
}

/*
 * remainder of a contiguous unsigned 16-bit array by a single divisor:
 * dst[i] = src[i] % scalar.
 *
 * args[0] is read as the dividend array, the divisor is the single element
 * read from args[1], args[2] is written as the output array, and len is the
 * number of elements. The divisor is broadcast and widened once, outside the
 * loop, via vsx4_divisor_u16().
 *
 * A zero divisor is handled explicitly (flag raised, output filled with 0,
 * the same result the array-divisor kernel stores) instead of reaching the
 * `%` in the scalar tail.
 * NOTE(review): callers presumably already filter out a zero divisor before
 * dispatching here -- confirm; the guard is kept as a safety net.
 */
static NPY_INLINE void
vsx4_simd_remainder_by_scalar_contig_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *src1  = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u16 *dst1  = (npyv_lanetype_u16 *) args[2];
    const int vstep          = npyv_nlanes_u16;

    if (NPY_UNLIKELY(scalar == 0)) {
        if (len > 0) {
            npy_set_floatstatus_divbyzero();
        }
        for (; len > 0; --len, ++dst1) {
            *dst1 = 0;
        }
        return;
    }

    const npyv_u16 vscalar   = npyv_setall_u16(scalar);
    const npyv_u32x2 divisor = vsx4_divisor_u16(vscalar);

    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
        npyv_u16 a = npyv_load_u16(src1);
        npyv_u16 c = vsx4_mod_scalar_u16(a, divisor);
        npyv_store_u16(dst1, c);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_u16 a = *src1;
        *dst1 = a % scalar;
    }
    npyv_cleanup();
}

#line 174
/*
 * divmod of two contiguous unsigned 16-bit arrays:
 * dst1[i] = src1[i] / src2[i] and dst2[i] = src1[i] % src2[i].
 *
 * args[0] and args[1] are read as the dividend and divisor arrays, args[2]
 * and args[3] are written as the quotient and remainder arrays, and len is
 * the number of elements. In the vector body the remainder is derived from
 * the quotient as a - b * quo, so only one vector division is needed.
 *
 * A zero divisor raises the divide-by-zero floating-point status flag and
 * yields 0 for both outputs, in the vector body as well as in the scalar
 * tail.
 */
static NPY_INLINE void
vsx4_simd_divmod_contig_u16(char **args, npy_intp len)
{
    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2];
    npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3];
    const npyv_u16 vzero    = npyv_zero_u16();
    const int vstep         = npyv_nlanes_u16;
    // Accumulates, across all iterations, the lanes that saw a zero divisor.
    npyv_b16 warn           = npyv_cvt_b16_u16(npyv_zero_u16());

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep, dst2 += vstep) {
        npyv_u16 a      = npyv_load_u16(src1);
        npyv_u16 b      = npyv_load_u16(src2);
        npyv_u16 quo    = vsx4_div_u16(a, b);
        npyv_u16 rem    = npyv_sub_u16(a, vec_mul(b, quo));
        npyv_b16 bzero  = npyv_cmpeq_u16(b, vzero);
        // When b is 0, force both results to 0 (as the scalar tail does)
        // rather than relying on what vec_div leaves in those lanes.
        quo  = npyv_select_u16(bzero, vzero, quo);
        rem  = npyv_select_u16(bzero, vzero, rem);
        warn = npyv_or_u16(bzero, warn);
        npyv_store_u16(dst1, quo);
        npyv_store_u16(dst2, rem);
    }

    // Raise the flag once for the whole vector body.
    if (!vec_all_eq(warn, vzero)) {
        npy_set_floatstatus_divbyzero();
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
        const npyv_lanetype_u16 a = *src1;
        const npyv_lanetype_u16 b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
            *dst2 = 0;
        } else{
            *dst1 = a / b;
            *dst2 = a % b;
        }
    }
    npyv_cleanup();
}

/*
 * Unsigned 16-bit divmod of a contiguous array by one scalar divisor.
 *   args[0]: input elements
 *   args[1]: pointer to the scalar divisor (read once, up front)
 *   args[2]: quotient output
 *   args[3]: remainder output
 *   len:     number of elements to process
 * The generated selector is `2 == 2`, so only the divmod path is compiled
 * and only that path is written out. No zero test is made on the divisor
 * inside this function.
 * NOTE(review): presumably callers only dispatch here with a non-zero
 * scalar - confirm at the call site.
 */
static NPY_INLINE void
vsx4_simd_divmod_by_scalar_contig_u16(char **args, npy_intp len)
{
    const int lanes            = npyv_nlanes_u16;
    npyv_lanetype_u16 *in      = (npyv_lanetype_u16 *) args[0];
    npyv_lanetype_u16 den      = *(npyv_lanetype_u16 *) args[1];
    npyv_lanetype_u16 *quo_out = (npyv_lanetype_u16 *) args[2];
    npyv_lanetype_u16 *rem_out = (npyv_lanetype_u16 *) args[3];
    // Broadcast divisor, plus the form that vsx4_div_scalar_u16 consumes.
    const npyv_u16 vden        = npyv_setall_u16(den);
    const npyv_u32x2 vden_wide = vsx4_divisor_u16(vden);

    // Whole vectors.
    while (len >= lanes) {
        npyv_u16 va = npyv_load_u16(in);
        npyv_u16 vq = vsx4_div_scalar_u16(va, vden_wide);
        // remainder = a - scalar * quotient
        npyv_u16 vr = npyv_sub_u16(va, vec_mul(vden, vq));
        npyv_store_u16(quo_out, vq);
        npyv_store_u16(rem_out, vr);

        in      += lanes;
        quo_out += lanes;
        rem_out += lanes;
        len     -= lanes;
    }

    // Leftover elements.
    while (len > 0) {
        const npyv_lanetype_u16 a = *in;
        *quo_out = a / den;
        *rem_out = a % den;
        ++in;
        ++quo_out;
        ++rem_out;
        --len;
    }
    npyv_cleanup();
}


#line 170
#line 174
/*
 * Unsigned 32-bit fmod over contiguous buffers.
 *   args[0], args[1]: dividend and divisor inputs
 *   args[2]:          output
 *   len:              number of elements to process
 * The generated selector is `0 == 2`, so the single-output path is the
 * only one compiled and the only one written out here.
 */
static NPY_INLINE void
vsx4_simd_fmod_contig_u32(char **args, npy_intp len)
{
    const int lanes        = npyv_nlanes_u32;
    const npyv_u32 zeros   = npyv_zero_u32();
    npyv_lanetype_u32 *num = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *den = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u32 *out = (npyv_lanetype_u32 *) args[2];

    // Whole vectors.
    while (len >= lanes) {
        npyv_u32 va = npyv_load_u32(num);
        npyv_u32 vb = npyv_load_u32(den);
        npyv_u32 vr = vsx4_mod_u32(va, vb);
        npyv_store_u32(out, vr);
        // The status flag is raised per iteration when any divisor lane is 0.
        // NOTE(review): such lanes are stored exactly as vsx4_mod_u32
        // returned them, whereas the scalar tail writes 0 - confirm the
        // helper yields 0 there.
        if (NPY_UNLIKELY(vec_any_eq(vb, zeros))) {
            npy_set_floatstatus_divbyzero();
        }

        num += lanes;
        den += lanes;
        out += lanes;
        len -= lanes;
    }

    // Leftover elements.
    while (len > 0) {
        const npyv_lanetype_u32 a = *num;
        const npyv_lanetype_u32 b = *den;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *out = 0;
        }
        else {
            *out = a % b;
        }
        ++num;
        ++den;
        ++out;
        --len;
    }
    npyv_cleanup();
}

/*
 * Unsigned 32-bit fmod of a contiguous array by one scalar divisor.
 *   args[0]: input elements
 *   args[1]: pointer to the scalar divisor (read once, up front)
 *   args[2]: output elements
 *   len:     number of elements to process
 * The generated selector is `0 == 2`, so only the single-output path is
 * compiled and only that path is written out. No zero test is made on the
 * divisor inside this function.
 * NOTE(review): presumably callers only dispatch here with a non-zero
 * scalar - confirm at the call site.
 */
static NPY_INLINE void
vsx4_simd_fmod_by_scalar_contig_u32(char **args, npy_intp len)
{
    const int lanes        = npyv_nlanes_u32;
    npyv_lanetype_u32 *in  = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 den  = *(npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u32 *out = (npyv_lanetype_u32 *) args[2];
    // Broadcast the divisor and pass it through vsx4_divisor_u32 once.
    const npyv_u32 vden    = npyv_setall_u32(den);
    const npyv_u32 vdiv    = vsx4_divisor_u32(vden);

    // Whole vectors.
    while (len >= lanes) {
        npyv_u32 va = npyv_load_u32(in);
        npyv_u32 vr = vsx4_mod_scalar_u32(va, vdiv);
        npyv_store_u32(out, vr);

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Leftover elements.
    while (len > 0) {
        const npyv_lanetype_u32 a = *in;
        *out = a % den;
        ++in;
        ++out;
        --len;
    }
    npyv_cleanup();
}

#line 174
/*
 * Unsigned 32-bit remainder over contiguous buffers.
 *   args[0], args[1]: dividend and divisor inputs
 *   args[2]:          output
 *   len:              number of elements to process
 * The generated selector is `1 == 2`, so the single-output path is the
 * only one compiled and the only one written out here.
 */
static NPY_INLINE void
vsx4_simd_remainder_contig_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *lhs = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *rhs = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u32 *res = (npyv_lanetype_u32 *) args[2];
    const npyv_u32 vnil    = npyv_zero_u32();
    const int width        = npyv_nlanes_u32;

    // Vector body: one full register of elements per pass.
    while (len >= width) {
        npyv_u32 x = npyv_load_u32(lhs);
        npyv_u32 y = npyv_load_u32(rhs);
        npyv_u32 m = vsx4_mod_u32(x, y);
        npyv_store_u32(res, m);
        // Raise divide-by-zero as soon as any divisor lane is 0.
        // NOTE(review): the stored value for such lanes is whatever
        // vsx4_mod_u32 returned; the scalar tail stores 0 - confirm they
        // agree.
        if (NPY_UNLIKELY(vec_any_eq(y, vnil))) {
            npy_set_floatstatus_divbyzero();
        }

        lhs += width;
        rhs += width;
        res += width;
        len -= width;
    }

    // Scalar tail.
    while (len > 0) {
        const npyv_lanetype_u32 x = *lhs;
        const npyv_lanetype_u32 y = *rhs;
        if (NPY_UNLIKELY(y == 0)) {
            npy_set_floatstatus_divbyzero();
            *res = 0;
        }
        else {
            *res = x % y;
        }
        ++lhs;
        ++rhs;
        ++res;
        --len;
    }
    npyv_cleanup();
}

/*
 * Unsigned 32-bit remainder of a contiguous array by one scalar divisor.
 *   args[0]: input elements
 *   args[1]: pointer to the scalar divisor (read once, up front)
 *   args[2]: output elements
 *   len:     number of elements to process
 * The generated selector is `1 == 2`, so only the single-output path is
 * compiled and only that path is written out. The divisor is not tested
 * against zero in this function.
 * NOTE(review): presumably the caller filters out a zero scalar - confirm.
 */
static NPY_INLINE void
vsx4_simd_remainder_by_scalar_contig_u32(char **args, npy_intp len)
{
    npyv_lanetype_u32 *lhs = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 rhs  = *(npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u32 *res = (npyv_lanetype_u32 *) args[2];
    const int width        = npyv_nlanes_u32;
    const npyv_u32 vrhs    = npyv_setall_u32(rhs);
    const npyv_u32 vdiv    = vsx4_divisor_u32(vrhs);

    // Vector body.
    while (len >= width) {
        npyv_u32 x = npyv_load_u32(lhs);
        npyv_u32 m = vsx4_mod_scalar_u32(x, vdiv);
        npyv_store_u32(res, m);

        lhs += width;
        res += width;
        len -= width;
    }

    // Scalar tail.
    while (len > 0) {
        const npyv_lanetype_u32 x = *lhs;
        *res = x % rhs;
        ++lhs;
        ++res;
        --len;
    }
    npyv_cleanup();
}

#line 174
/*
 * Unsigned 32-bit divmod over contiguous buffers.
 *   args[0], args[1]: dividend and divisor inputs
 *   args[2], args[3]: quotient and remainder outputs
 *   len:              number of elements to process
 * The template selector for this instantiation is `2 == 2`, so the divmod
 * path is the only one ever compiled and the only one written out here.
 */
static NPY_INLINE void
vsx4_simd_divmod_contig_u32(char **args, npy_intp len)
{
    const int lanes                = npyv_nlanes_u32;
    const npyv_u32 zeros           = npyv_zero_u32();
    npyv_lanetype_u32 *num         = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 *den         = (npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u32 *quo_out     = (npyv_lanetype_u32 *) args[2];
    npyv_lanetype_u32 *rem_out     = (npyv_lanetype_u32 *) args[3];
    // Mask that accumulates every lane in which a zero divisor was seen.
    npyv_b32 zero_seen             = npyv_cvt_b32_u32(npyv_zero_u32());

    // Whole vectors first.
    while (len >= lanes) {
        npyv_u32 va          = npyv_load_u32(num);
        npyv_u32 vb          = npyv_load_u32(den);
        npyv_u32 vq          = vsx4_div_u32(va, vb);
        // remainder = a - b * quotient
        npyv_u32 vr          = npyv_sub_u32(va, vec_mul(vb, vq));
        npyv_b32 den_is_zero = npyv_cmpeq_u32(vb, zeros);
        // Zero-divisor lanes get a remainder of 0; the quotient is stored
        // exactly as vsx4_div_u32 produced it.
        vr        = npyv_select_u32(den_is_zero, zeros, vr);
        zero_seen = npyv_or_u32(den_is_zero, zero_seen);
        npyv_store_u32(quo_out, vq);
        npyv_store_u32(rem_out, vr);

        num     += lanes;
        den     += lanes;
        quo_out += lanes;
        rem_out += lanes;
        len     -= lanes;
    }

    // The divide-by-zero status is raised once, after the vector loop.
    if (!vec_all_eq(zero_seen, zeros)) {
        npy_set_floatstatus_divbyzero();
    }

    // Elements that did not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_u32 a = *num;
        const npyv_lanetype_u32 b = *den;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *quo_out = 0;
            *rem_out = 0;
        }
        else {
            *quo_out = a / b;
            *rem_out = a % b;
        }
        ++num;
        ++den;
        ++quo_out;
        ++rem_out;
        --len;
    }
    npyv_cleanup();
}

/*
 * Unsigned 32-bit divmod of a contiguous array by one scalar divisor.
 *   args[0]: input elements
 *   args[1]: pointer to the scalar divisor (read once, up front)
 *   args[2]: quotient output
 *   args[3]: remainder output
 *   len:     number of elements to process
 * The generated selector is `2 == 2`, so only the divmod path is compiled
 * and only that path is written out. No zero test is made on the divisor
 * inside this function.
 * NOTE(review): presumably callers only dispatch here with a non-zero
 * scalar - confirm at the call site.
 */
static NPY_INLINE void
vsx4_simd_divmod_by_scalar_contig_u32(char **args, npy_intp len)
{
    const int lanes            = npyv_nlanes_u32;
    npyv_lanetype_u32 *in      = (npyv_lanetype_u32 *) args[0];
    npyv_lanetype_u32 den      = *(npyv_lanetype_u32 *) args[1];
    npyv_lanetype_u32 *quo_out = (npyv_lanetype_u32 *) args[2];
    npyv_lanetype_u32 *rem_out = (npyv_lanetype_u32 *) args[3];
    // Broadcast divisor, plus the form that vsx4_div_scalar_u32 consumes.
    const npyv_u32 vden        = npyv_setall_u32(den);
    const npyv_u32 vdiv        = vsx4_divisor_u32(vden);

    // Whole vectors.
    while (len >= lanes) {
        npyv_u32 va = npyv_load_u32(in);
        npyv_u32 vq = vsx4_div_scalar_u32(va, vdiv);
        // remainder = a - scalar * quotient
        npyv_u32 vr = npyv_sub_u32(va, vec_mul(vden, vq));
        npyv_store_u32(quo_out, vq);
        npyv_store_u32(rem_out, vr);

        in      += lanes;
        quo_out += lanes;
        rem_out += lanes;
        len     -= lanes;
    }

    // Leftover elements.
    while (len > 0) {
        const npyv_lanetype_u32 a = *in;
        *quo_out = a / den;
        *rem_out = a % den;
        ++in;
        ++quo_out;
        ++rem_out;
        --len;
    }
    npyv_cleanup();
}


#line 170
#line 174
/*
 * Unsigned 64-bit fmod over contiguous buffers.
 *   args[0], args[1]: dividend and divisor inputs
 *   args[2]:          output
 *   len:              number of elements to process
 * The generated selector is `0 == 2`, so the single-output path is the
 * only one compiled and the only one written out here.
 */
static NPY_INLINE void
vsx4_simd_fmod_contig_u64(char **args, npy_intp len)
{
    const int lanes        = npyv_nlanes_u64;
    const npyv_u64 zeros   = npyv_zero_u64();
    npyv_lanetype_u64 *num = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *den = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u64 *out = (npyv_lanetype_u64 *) args[2];

    // Whole vectors.
    while (len >= lanes) {
        npyv_u64 va = npyv_load_u64(num);
        npyv_u64 vb = npyv_load_u64(den);
        npyv_u64 vr = vsx4_mod_u64(va, vb);
        npyv_store_u64(out, vr);
        // The status flag is raised per iteration when any divisor lane is 0.
        // NOTE(review): such lanes are stored exactly as vsx4_mod_u64
        // returned them, whereas the scalar tail writes 0 - confirm the
        // helper yields 0 there.
        if (NPY_UNLIKELY(vec_any_eq(vb, zeros))) {
            npy_set_floatstatus_divbyzero();
        }

        num += lanes;
        den += lanes;
        out += lanes;
        len -= lanes;
    }

    // Leftover elements.
    while (len > 0) {
        const npyv_lanetype_u64 a = *num;
        const npyv_lanetype_u64 b = *den;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *out = 0;
        }
        else {
            *out = a % b;
        }
        ++num;
        ++den;
        ++out;
        --len;
    }
    npyv_cleanup();
}

/*
 * Unsigned 64-bit fmod of a contiguous array by one scalar divisor.
 *   args[0]: input elements
 *   args[1]: pointer to the scalar divisor (read once, up front)
 *   args[2]: output elements
 *   len:     number of elements to process
 * The generated selector is `0 == 2`, so only the single-output path is
 * compiled and only that path is written out. No zero test is made on the
 * divisor inside this function.
 * NOTE(review): presumably callers only dispatch here with a non-zero
 * scalar - confirm at the call site.
 */
static NPY_INLINE void
vsx4_simd_fmod_by_scalar_contig_u64(char **args, npy_intp len)
{
    const int lanes        = npyv_nlanes_u64;
    npyv_lanetype_u64 *in  = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 den  = *(npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u64 *out = (npyv_lanetype_u64 *) args[2];
    // Broadcast the divisor and pass it through vsx4_divisor_u64 once.
    const npyv_u64 vden    = npyv_setall_u64(den);
    const npyv_u64 vdiv    = vsx4_divisor_u64(vden);

    // Whole vectors.
    while (len >= lanes) {
        npyv_u64 va = npyv_load_u64(in);
        npyv_u64 vr = vsx4_mod_scalar_u64(va, vdiv);
        npyv_store_u64(out, vr);

        in  += lanes;
        out += lanes;
        len -= lanes;
    }

    // Leftover elements.
    while (len > 0) {
        const npyv_lanetype_u64 a = *in;
        *out = a % den;
        ++in;
        ++out;
        --len;
    }
    npyv_cleanup();
}

#line 174
/*
 * Unsigned 64-bit remainder over contiguous buffers.
 *   args[0], args[1]: dividend and divisor inputs
 *   args[2]:          output
 *   len:              number of elements to process
 * The generated selector is `1 == 2`, so the single-output path is the
 * only one compiled and the only one written out here.
 */
static NPY_INLINE void
vsx4_simd_remainder_contig_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u64 *res = (npyv_lanetype_u64 *) args[2];
    const npyv_u64 vnil    = npyv_zero_u64();
    const int width        = npyv_nlanes_u64;

    // Vector body: one full register of elements per pass.
    while (len >= width) {
        npyv_u64 x = npyv_load_u64(lhs);
        npyv_u64 y = npyv_load_u64(rhs);
        npyv_u64 m = vsx4_mod_u64(x, y);
        npyv_store_u64(res, m);
        // Raise divide-by-zero as soon as any divisor lane is 0.
        // NOTE(review): the stored value for such lanes is whatever
        // vsx4_mod_u64 returned; the scalar tail stores 0 - confirm they
        // agree.
        if (NPY_UNLIKELY(vec_any_eq(y, vnil))) {
            npy_set_floatstatus_divbyzero();
        }

        lhs += width;
        rhs += width;
        res += width;
        len -= width;
    }

    // Scalar tail.
    while (len > 0) {
        const npyv_lanetype_u64 x = *lhs;
        const npyv_lanetype_u64 y = *rhs;
        if (NPY_UNLIKELY(y == 0)) {
            npy_set_floatstatus_divbyzero();
            *res = 0;
        }
        else {
            *res = x % y;
        }
        ++lhs;
        ++rhs;
        ++res;
        --len;
    }
    npyv_cleanup();
}

/*
 * Unsigned 64-bit remainder of a contiguous array by one scalar divisor.
 *   args[0]: input elements
 *   args[1]: pointer to the scalar divisor (read once, up front)
 *   args[2]: output elements
 *   len:     number of elements to process
 * The generated selector is `1 == 2`, so only the single-output path is
 * compiled and only that path is written out. The divisor is not tested
 * against zero in this function.
 * NOTE(review): presumably the caller filters out a zero scalar - confirm.
 */
static NPY_INLINE void
vsx4_simd_remainder_by_scalar_contig_u64(char **args, npy_intp len)
{
    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 rhs  = *(npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u64 *res = (npyv_lanetype_u64 *) args[2];
    const int width        = npyv_nlanes_u64;
    const npyv_u64 vrhs    = npyv_setall_u64(rhs);
    const npyv_u64 vdiv    = vsx4_divisor_u64(vrhs);

    // Vector body.
    while (len >= width) {
        npyv_u64 x = npyv_load_u64(lhs);
        npyv_u64 m = vsx4_mod_scalar_u64(x, vdiv);
        npyv_store_u64(res, m);

        lhs += width;
        res += width;
        len -= width;
    }

    // Scalar tail.
    while (len > 0) {
        const npyv_lanetype_u64 x = *lhs;
        *res = x % rhs;
        ++lhs;
        ++res;
        --len;
    }
    npyv_cleanup();
}

#line 174
/*
 * Unsigned 64-bit divmod over contiguous buffers.
 *   args[0], args[1]: dividend and divisor inputs
 *   args[2], args[3]: quotient and remainder outputs
 *   len:              number of elements to process
 * The template selector for this instantiation is `2 == 2`, so the divmod
 * path is the only one ever compiled and the only one written out here.
 */
static NPY_INLINE void
vsx4_simd_divmod_contig_u64(char **args, npy_intp len)
{
    const int lanes                = npyv_nlanes_u64;
    const npyv_u64 zeros           = npyv_zero_u64();
    npyv_lanetype_u64 *num         = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 *den         = (npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u64 *quo_out     = (npyv_lanetype_u64 *) args[2];
    npyv_lanetype_u64 *rem_out     = (npyv_lanetype_u64 *) args[3];
    // Mask that accumulates every lane in which a zero divisor was seen.
    npyv_b64 zero_seen             = npyv_cvt_b64_u64(npyv_zero_u64());

    // Whole vectors first.
    while (len >= lanes) {
        npyv_u64 va          = npyv_load_u64(num);
        npyv_u64 vb          = npyv_load_u64(den);
        npyv_u64 vq          = vsx4_div_u64(va, vb);
        // remainder = a - b * quotient
        npyv_u64 vr          = npyv_sub_u64(va, vec_mul(vb, vq));
        npyv_b64 den_is_zero = npyv_cmpeq_u64(vb, zeros);
        // Zero-divisor lanes get a remainder of 0; the quotient is stored
        // exactly as vsx4_div_u64 produced it.
        vr        = npyv_select_u64(den_is_zero, zeros, vr);
        zero_seen = npyv_or_u64(den_is_zero, zero_seen);
        npyv_store_u64(quo_out, vq);
        npyv_store_u64(rem_out, vr);

        num     += lanes;
        den     += lanes;
        quo_out += lanes;
        rem_out += lanes;
        len     -= lanes;
    }

    // The divide-by-zero status is raised once, after the vector loop.
    if (!vec_all_eq(zero_seen, zeros)) {
        npy_set_floatstatus_divbyzero();
    }

    // Elements that did not fill a whole vector.
    while (len > 0) {
        const npyv_lanetype_u64 a = *num;
        const npyv_lanetype_u64 b = *den;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *quo_out = 0;
            *rem_out = 0;
        }
        else {
            *quo_out = a / b;
            *rem_out = a % b;
        }
        ++num;
        ++den;
        ++quo_out;
        ++rem_out;
        --len;
    }
    npyv_cleanup();
}

/*
 * Unsigned 64-bit divmod of a contiguous array by one scalar divisor.
 *   args[0]: input elements
 *   args[1]: pointer to the scalar divisor (read once, up front)
 *   args[2]: quotient output
 *   args[3]: remainder output
 *   len:     number of elements to process
 * The generated selector is `2 == 2`, so only the divmod path is compiled
 * and only that path is written out. No zero test is made on the divisor
 * inside this function.
 * NOTE(review): presumably callers only dispatch here with a non-zero
 * scalar - confirm at the call site.
 */
static NPY_INLINE void
vsx4_simd_divmod_by_scalar_contig_u64(char **args, npy_intp len)
{
    const int lanes            = npyv_nlanes_u64;
    npyv_lanetype_u64 *in      = (npyv_lanetype_u64 *) args[0];
    npyv_lanetype_u64 den      = *(npyv_lanetype_u64 *) args[1];
    npyv_lanetype_u64 *quo_out = (npyv_lanetype_u64 *) args[2];
    npyv_lanetype_u64 *rem_out = (npyv_lanetype_u64 *) args[3];
    // Broadcast divisor, plus the form that vsx4_div_scalar_u64 consumes.
    const npyv_u64 vden        = npyv_setall_u64(den);
    const npyv_u64 vdiv        = vsx4_divisor_u64(vden);

    // Whole vectors.
    while (len >= lanes) {
        npyv_u64 va = npyv_load_u64(in);
        npyv_u64 vq = vsx4_div_scalar_u64(va, vdiv);
        // remainder = a - scalar * quotient
        npyv_u64 vr = npyv_sub_u64(va, vec_mul(vden, vq));
        npyv_store_u64(quo_out, vq);
        npyv_store_u64(rem_out, vr);

        in      += lanes;
        quo_out += lanes;
        rem_out += lanes;
        len     -= lanes;
    }

    // Leftover elements.
    while (len > 0) {
        const npyv_lanetype_u64 a = *in;
        *quo_out = a / den;
        *rem_out = a % den;
        ++in;
        ++quo_out;
        ++rem_out;
        --len;
    }
    npyv_cleanup();
}



#line 291
#line 295
/*
 * Signed 8-bit fmod over contiguous buffers.
 *   args[0], args[1]: dividend and divisor inputs
 *   args[2]:          output
 *   len:              number of elements to process
 *
 * The body is one template shared by three modes, selected by the literal
 * on the left of each `#if` (0 = fmod, 1 = remainder, 2 = divmod). This
 * instantiation uses 0, so only the branches whose condition holds for 0
 * are compiled:
 *   - vector loop: rem = vsx4_mod_s8(a, b) is stored to the output and the
 *     divide-by-zero status is raised whenever any lane of b is 0;
 *   - scalar tail: writes 0 when b == 0 or (a == NPY_MIN_INT8 && b == -1),
 *     raising divide-by-zero only in the b == 0 case, otherwise a % b.
 */
static NPY_INLINE void
vsx4_simd_fmod_contig_s8(char **args, npy_intp len)
{
    npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2];
    const npyv_s8 vzero    = npyv_zero_s8();
    const int vstep           = npyv_nlanes_s8;
    // Note: the `for` header itself is chosen by the preprocessor; both
    // variants open the same loop body, which is closed once further down.
#if 0 == 2 /* divmod */
    npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3];
    const npyv_s8 vneg_one = npyv_setall_s8(-1);
    const npyv_s8 vmin     = npyv_setall_s8(NPY_MIN_INT8);
    npyv_b8 warn_zero     = npyv_cvt_b8_s8(npyv_zero_s8());
    npyv_b8 warn_overflow = npyv_cvt_b8_s8(npyv_zero_s8());

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep, dst2 += vstep) {
#else /* fmod and remainder */
    // Active for fmod: single output pointer advanced per vector.
    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
#endif
        npyv_s8 a = npyv_load_s8(src1);
        npyv_s8 b = npyv_load_s8(src2);
#if 0 <= 1 /* fmod and remainder */
        // Active for fmod: the remainder comes straight from vsx4_mod_s8.
        npyv_s8 rem       = vsx4_mod_s8(a, b);
#else /* divmod */
        npyv_s8 quo       = vsx4_div_s8(a, b);
        npyv_s8 rem       = npyv_sub_s8(a, vec_mul(b, quo));
        // (b == 0 || (a == NPY_MIN_INT8 && b == -1))
        npyv_b8 bzero    = npyv_cmpeq_s8(b, vzero);
        npyv_b8 amin     = npyv_cmpeq_s8(a, vmin);
        npyv_b8 bneg_one = npyv_cmpeq_s8(b, vneg_one);
        npyv_b8 overflow = npyv_and_s8(bneg_one, amin);
                warn_zero = npyv_or_s8(bzero, warn_zero);
               warn_overflow = npyv_or_s8(overflow, warn_overflow);
#endif
        // Not compiled for fmod (0 >= 1 is false): no sign adjustment of
        // the remainder is applied in this instantiation.
#if 0 >= 1 /* remainder and divmod */
        // handle mixed case the way Python does
        // ((a > 0) == (b > 0) || rem == 0)
        npyv_b8 a_gt_zero  = npyv_cmpgt_s8(a, vzero);
        npyv_b8 b_gt_zero  = npyv_cmpgt_s8(b, vzero);
        npyv_b8 ab_eq_cond = npyv_cmpeq_s8(a_gt_zero, b_gt_zero);
        npyv_b8 rem_zero   = npyv_cmpeq_s8(rem, vzero);
        npyv_b8 or         = npyv_or_s8(ab_eq_cond, rem_zero);
        npyv_s8 to_add      = npyv_select_s8(or, vzero, b);
                           rem = npyv_add_s8(rem, to_add);
#endif
#if 0 == 2 /* divmod */
        npyv_s8 to_sub = npyv_select_s8(or, vzero, vneg_one);
                      quo = npyv_add_s8(quo, to_sub);
                      // Divide by zero
                      quo = npyv_select_s8(bzero, vzero, quo);
                      rem = npyv_select_s8(bzero, vzero, rem);
                      // Overflow
                      quo = npyv_select_s8(overflow, vmin, quo);
                      rem = npyv_select_s8(overflow, vzero, rem);
        npyv_store_s8(dst1, quo);
        npyv_store_s8(dst2, rem);
#else /* fmod and remainder */
        // Active for fmod: store the remainder, then flag divide-by-zero if
        // any divisor lane is 0. The stored lanes are not overridden here.
        // NOTE(review): lanes with b == 0 keep whatever vsx4_mod_s8
        // returned, while the scalar tail writes 0 - confirm they agree.
        npyv_store_s8(dst1, rem);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }
#endif
    }

#if 0 == 2 /* divmod */
    if (!vec_all_eq(warn_zero, vzero)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(warn_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
        const npyv_lanetype_s8 a = *src1;
        const npyv_lanetype_s8 b = *src2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT8, NPY_TRUE)) {
            if (b == 0) {
                npy_set_floatstatus_divbyzero();
                *dst1 = 0;
                *dst2 = 0;
            }
            else {
                npy_set_floatstatus_overflow();
                *dst1 = NPY_MIN_INT8;
                *dst2 = 0;
            }
        }
        else {
            *dst1 = a / b;
            *dst2 = a % b;
            if (!((a > 0) == (b > 0) || *dst2 == 0)) {
                *dst1 -= 1;
                *dst2 += b;
            }
        }
    }
#else /* fmod and remainder */
    // Active for fmod: scalar tail for the elements left after the last
    // full vector.
    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_s8 a = *src1;
        const npyv_lanetype_s8 b = *src2;
        // True for b == 0 and for (a == NPY_MIN_INT8 && b == -1); both
        // cases skip the C `%` and write 0. Only b == 0 sets the
        // divide-by-zero status (FLAG_IF_DIVIDEBYZERO tests b == 0).
        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT8, NPY_TRUE)) {
            FLAG_IF_DIVIDEBYZERO(b);
            *dst1 = 0;
        } else{
            *dst1 = a % b;
            // Not compiled for fmod (0 == 1 is false).
#if 0 == 1 /* remainder */
            if (!((a > 0) == (b > 0) || *dst1 == 0)) {
                *dst1 += b;
            }
#endif
        }
    }
#endif
    npyv_cleanup();
}

/*
 * Signed 8-bit fmod of a contiguous array by one scalar divisor.
 *   args[0]: input elements
 *   args[1]: pointer to the scalar divisor (read once, up front)
 *   args[2]: output elements
 *   len:     number of elements to process
 *
 * Shares the three-mode template layout (0 = fmod, 1 = remainder,
 * 2 = divmod); this instantiation uses 0, so the compiled path is:
 *   - vector loop: store vsx4_mod_scalar_s8(a, divisor);
 *   - scalar tail: store a % scalar.
 * No zero or overflow test on the scalar is compiled into this path.
 * NOTE(review): presumably callers only dispatch here with a divisor for
 * which `%` is well defined - confirm at the call site.
 */
static NPY_INLINE void
vsx4_simd_fmod_by_scalar_contig_s8(char **args, npy_intp len)
{
    npyv_lanetype_s8 *src1  = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1];
    npyv_lanetype_s8 *dst1  = (npyv_lanetype_s8 *) args[2];
    // Broadcast the scalar, then build the four-part 32-bit divisor struct
    // that vsx4_mod_scalar_s8 takes; both are computed once, before the loop.
    const npyv_s8 vscalar   = npyv_setall_s8(scalar);
    const vsx4_s32x4 divisor    = vsx4_divisor_s8(vscalar);
    const int vstep            = npyv_nlanes_s8;
    // Not compiled for fmod (0 >= 1 is false).
#if 0 >= 1 /* remainder and divmod */
    const npyv_s8 vzero     = npyv_zero_s8();
    npyv_b8 b_gt_zero      = npyv_cmpgt_s8(vscalar, vzero);
#endif
    // The `for` header is chosen by the preprocessor; the fmod variant
    // advances one input and one output pointer per vector.
#if 0 == 2 /* divmod */
    npyv_b8 warn          = npyv_cvt_b8_s8(npyv_zero_s8());
    const npyv_s8 vmin     = npyv_setall_s8(NPY_MIN_INT8);
    const npyv_s8 vneg_one = npyv_setall_s8(-1);
    npyv_b8 bneg_one      = npyv_cmpeq_s8(vscalar, vneg_one);
    npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3];

    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
         dst2 += vstep) {
#else /* fmod and remainder */
    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
#endif
        npyv_s8 a = npyv_load_s8(src1);
#if 0 <= 1 /* fmod and remainder */
        // Active for fmod: remainder taken directly from the helper.
        npyv_s8 rem       = vsx4_mod_scalar_s8(a, divisor);
#else /* divmod */
        npyv_s8 quo       = vsx4_div_scalar_s8(a, divisor);
        npyv_s8 rem       = npyv_sub_s8(a, vec_mul(vscalar, quo));
        // (a == NPY_MIN_INT8 && b == -1)
        npyv_b8 amin     = npyv_cmpeq_s8(a, vmin);
        npyv_b8 overflow = npyv_and_s8(bneg_one, amin);
                        warn = npyv_or_s8(overflow, warn);
#endif
        // Not compiled for fmod: no sign adjustment of the remainder.
#if 0 >= 1 /* remainder and divmod */
        // handle mixed case the way Python does
        // ((a > 0) == (b > 0) || rem == 0)
        npyv_b8 a_gt_zero  = npyv_cmpgt_s8(a, vzero);
        npyv_b8 ab_eq_cond = npyv_cmpeq_s8(a_gt_zero, b_gt_zero);
        npyv_b8 rem_zero   = npyv_cmpeq_s8(rem, vzero);
        npyv_b8 or         = npyv_or_s8(ab_eq_cond, rem_zero);
        npyv_s8 to_add      = npyv_select_s8(or, vzero, vscalar);
                           rem = npyv_add_s8(rem, to_add);
#endif
#if 0 == 2 /* divmod */
        npyv_s8 to_sub = npyv_select_s8(or, vzero, vneg_one);
        quo               = npyv_add_s8(quo, to_sub);
        // Overflow: set quo to minimum and rem to 0
        quo               = npyv_select_s8(overflow, vmin, quo);
        rem               = npyv_select_s8(overflow, vzero, rem);
        npyv_store_s8(dst1, quo);
        npyv_store_s8(dst2, rem);
#else /* fmod and remainder */
        // Active for fmod: the remainder is the only value stored.
        npyv_store_s8(dst1, rem);
#endif
    }

#if 0 == 2 /* divmod */
    if (!vec_all_eq(warn, vzero)) {
        npy_set_floatstatus_overflow();
    }

    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
        const npyv_lanetype_s8 a = *src1;
        if (NPY_UNLIKELY(a == NPY_MIN_INT8 && scalar == -1)) {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT8;
            *dst2 = 0;
        }
        else {
            *dst1 = a / scalar;
            *dst2 = a % scalar;
            if (!((a > 0) == (scalar > 0) || *dst2 == 0)) {
                *dst1 -= 1;
                *dst2 += scalar;
            }
        }
    }
#else /* fmod and remainder */
    // Active for fmod: scalar tail for the elements left after the last
    // full vector, using the plain C `%` operator.
    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_s8 a = *src1;
        *dst1 = a % scalar;
        // Not compiled for fmod (0 == 1 is false).
#if 0 == 1 /* remainder */
        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
            *dst1 += scalar;
        }
#endif
    }
#endif
    npyv_cleanup();
}

#line 295
/*
 * Remainder of two contiguous int8 arrays, adjusted the way Python does for
 * mixed-sign operands: args[0][i] % args[1][i] -> args[2][i], `len` elements.
 *
 * Whole vectors go through vsx4_mod_s8 followed by a sign fix-up; whatever
 * is left (fewer than one vector) is handled by a scalar loop.  The
 * divide-by-zero status is raised whenever a zero divisor is encountered.
 * In the scalar loop a zero divisor, or the (NPY_MIN_INT8, -1) pair,
 * stores 0.
 */
static NPY_INLINE void
vsx4_simd_remainder_contig_s8(char **args, npy_intp len)
{
    npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2];
    const npyv_s8 vzero    = npyv_zero_s8();
    const int vstep        = npyv_nlanes_s8;

    // Vector part: consume one full vector of lanes per pass.
    while (len >= vstep) {
        npyv_s8 a   = npyv_load_s8(src1);
        npyv_s8 b   = npyv_load_s8(src2);
        npyv_s8 rem = vsx4_mod_s8(a, b);

        // Lanes satisfying ((a > 0) == (b > 0) || rem == 0) keep the
        // remainder as is; all other lanes get the divisor added.
        npyv_b8 a_pos     = npyv_cmpgt_s8(a, vzero);
        npyv_b8 b_pos     = npyv_cmpgt_s8(b, vzero);
        npyv_b8 same_sign = npyv_cmpeq_s8(a_pos, b_pos);
        npyv_b8 exact     = npyv_cmpeq_s8(rem, vzero);
        npyv_b8 keep      = npyv_or_s8(same_sign, exact);
        npyv_s8 fixup     = npyv_select_s8(keep, vzero, b);
        rem = npyv_add_s8(rem, fixup);

        npyv_store_s8(dst1, rem);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }

        len  -= vstep;
        src1 += vstep;
        src2 += vstep;
        dst1 += vstep;
    }

    // Scalar part: the remaining (len < vstep) elements.
    while (len > 0) {
        const npyv_lanetype_s8 a = *src1;
        const npyv_lanetype_s8 b = *src2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT8, NPY_TRUE)) {
            FLAG_IF_DIVIDEBYZERO(b);
            *dst1 = 0;
        }
        else {
            npyv_lanetype_s8 r = a % b;
            if ((a > 0) != (b > 0) && r != 0) {
                r += b;
            }
            *dst1 = r;
        }
        --len;
        ++src1;
        ++src2;
        ++dst1;
    }
    npyv_cleanup();
}

/*
 * Remainder of a contiguous int8 array by one int8 divisor, adjusted the
 * way Python does for mixed-sign operands:
 * args[0][i] % *args[1] -> args[2][i], `len` elements.
 *
 * NOTE(review): nothing in this function guards against a zero divisor and
 * no float status is raised here -- presumably the caller only dispatches
 * to it with a non-zero scalar; confirm at the call site.
 */
static NPY_INLINE void
vsx4_simd_remainder_by_scalar_contig_s8(char **args, npy_intp len)
{
    npyv_lanetype_s8 *src1  = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1];
    npyv_lanetype_s8 *dst1  = (npyv_lanetype_s8 *) args[2];
    const npyv_s8 vscalar    = npyv_setall_s8(scalar);
    const vsx4_s32x4 divisor = vsx4_divisor_s8(vscalar);
    const int vstep          = npyv_nlanes_s8;
    const npyv_s8 vzero      = npyv_zero_s8();
    // Loop invariant: lanes set where the divisor is positive.
    npyv_b8 b_pos            = npyv_cmpgt_s8(vscalar, vzero);

    // Vector part: consume one full vector of lanes per pass.
    while (len >= vstep) {
        npyv_s8 a   = npyv_load_s8(src1);
        npyv_s8 rem = vsx4_mod_scalar_s8(a, divisor);

        // Lanes satisfying ((a > 0) == (b > 0) || rem == 0) keep the
        // remainder as is; all other lanes get the divisor added.
        npyv_b8 a_pos     = npyv_cmpgt_s8(a, vzero);
        npyv_b8 same_sign = npyv_cmpeq_s8(a_pos, b_pos);
        npyv_b8 exact     = npyv_cmpeq_s8(rem, vzero);
        npyv_b8 keep      = npyv_or_s8(same_sign, exact);
        npyv_s8 fixup     = npyv_select_s8(keep, vzero, vscalar);
        rem = npyv_add_s8(rem, fixup);

        npyv_store_s8(dst1, rem);

        len  -= vstep;
        src1 += vstep;
        dst1 += vstep;
    }

    // Scalar part: the remaining (len < vstep) elements.
    while (len > 0) {
        const npyv_lanetype_s8 a = *src1;
        npyv_lanetype_s8 r = a % scalar;
        if ((a > 0) != (scalar > 0) && r != 0) {
            r += scalar;
        }
        *dst1 = r;
        --len;
        ++src1;
        ++dst1;
    }
    npyv_cleanup();
}

#line 295
/*
 * Combined quotient/remainder of two contiguous int8 arrays, adjusted the
 * way Python does for mixed-sign operands:
 *   args[2][i] = quotient, args[3][i] = remainder of args[0][i], args[1][i]
 * for `len` elements.
 *
 * Special cases (identical in the vector and the scalar part):
 *   - divisor == 0                   -> quotient 0, remainder 0,
 *                                       divide-by-zero status raised
 *   - (NPY_MIN_INT8, -1)             -> quotient NPY_MIN_INT8, remainder 0,
 *                                       overflow status raised
 * In the vector part the two conditions are accumulated across iterations
 * and the status flags are raised once after the loop.
 */
static NPY_INLINE void
vsx4_simd_divmod_contig_s8(char **args, npy_intp len)
{
    npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1];
    npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2];
    npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3];
    const npyv_s8 vzero    = npyv_zero_s8();
    const npyv_s8 vneg_one = npyv_setall_s8(-1);
    const npyv_s8 vmin     = npyv_setall_s8(NPY_MIN_INT8);
    const int vstep        = npyv_nlanes_s8;
    // Sticky masks of lanes that ever hit a zero divisor / an overflow.
    npyv_b8 seen_zero     = npyv_cvt_b8_s8(npyv_zero_s8());
    npyv_b8 seen_overflow = npyv_cvt_b8_s8(npyv_zero_s8());

    // Vector part: consume one full vector of lanes per pass.
    while (len >= vstep) {
        npyv_s8 a   = npyv_load_s8(src1);
        npyv_s8 b   = npyv_load_s8(src2);
        npyv_s8 quo = vsx4_div_s8(a, b);
        npyv_s8 rem = npyv_sub_s8(a, vec_mul(b, quo));

        // (b == 0 || (a == NPY_MIN_INT8 && b == -1))
        npyv_b8 b_is_zero  = npyv_cmpeq_s8(b, vzero);
        npyv_b8 a_is_min   = npyv_cmpeq_s8(a, vmin);
        npyv_b8 b_is_neg1  = npyv_cmpeq_s8(b, vneg_one);
        npyv_b8 overflowed = npyv_and_s8(b_is_neg1, a_is_min);
        seen_zero     = npyv_or_s8(b_is_zero, seen_zero);
        seen_overflow = npyv_or_s8(overflowed, seen_overflow);

        // Lanes satisfying ((a > 0) == (b > 0) || rem == 0) are left alone;
        // all other lanes get rem += b and quo += -1.
        npyv_b8 a_pos     = npyv_cmpgt_s8(a, vzero);
        npyv_b8 b_pos     = npyv_cmpgt_s8(b, vzero);
        npyv_b8 same_sign = npyv_cmpeq_s8(a_pos, b_pos);
        npyv_b8 exact     = npyv_cmpeq_s8(rem, vzero);
        npyv_b8 keep      = npyv_or_s8(same_sign, exact);
        npyv_s8 rem_fixup = npyv_select_s8(keep, vzero, b);
        npyv_s8 quo_fixup = npyv_select_s8(keep, vzero, vneg_one);
        rem = npyv_add_s8(rem, rem_fixup);
        quo = npyv_add_s8(quo, quo_fixup);

        // Divide by zero: both results forced to 0.
        quo = npyv_select_s8(b_is_zero, vzero, quo);
        rem = npyv_select_s8(b_is_zero, vzero, rem);
        // Overflow: quotient forced to the minimum, remainder to 0.
        quo = npyv_select_s8(overflowed, vmin, quo);
        rem = npyv_select_s8(overflowed, vzero, rem);

        npyv_store_s8(dst1, quo);
        npyv_store_s8(dst2, rem);

        len  -= vstep;
        src1 += vstep;
        src2 += vstep;
        dst1 += vstep;
        dst2 += vstep;
    }

    if (!vec_all_eq(seen_zero, vzero)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(seen_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    // Scalar part: the remaining (len < vstep) elements.
    while (len > 0) {
        const npyv_lanetype_s8 a = *src1;
        const npyv_lanetype_s8 b = *src2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT8, NPY_TRUE)) {
            if (b != 0) {
                // Only (NPY_MIN_INT8, -1) reaches this branch.
                npy_set_floatstatus_overflow();
                *dst1 = NPY_MIN_INT8;
                *dst2 = 0;
            }
            else {
                npy_set_floatstatus_divbyzero();
                *dst1 = 0;
                *dst2 = 0;
            }
        }
        else {
            *dst1 = a / b;
            *dst2 = a % b;
            if ((a > 0) != (b > 0) && *dst2 != 0) {
                *dst1 -= 1;
                *dst2 += b;
            }
        }
        --len;
        ++src1;
        ++src2;
        ++dst1;
        ++dst2;
    }
    npyv_cleanup();
}

/*
 * Combined quotient/remainder of a contiguous int8 array by one int8
 * divisor, adjusted the way Python does for mixed-sign operands:
 *   args[2][i] = quotient, args[3][i] = remainder of args[0][i], *args[1]
 * for `len` elements.
 *
 * The (NPY_MIN_INT8, -1) pair yields quotient NPY_MIN_INT8, remainder 0 and
 * raises the overflow status (once after the vector loop, per element in
 * the scalar loop).
 *
 * NOTE(review): nothing in this function guards against a zero divisor --
 * presumably the caller only dispatches to it with a non-zero scalar;
 * confirm at the call site.
 */
static NPY_INLINE void
vsx4_simd_divmod_by_scalar_contig_s8(char **args, npy_intp len)
{
    npyv_lanetype_s8 *src1  = (npyv_lanetype_s8 *) args[0];
    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1];
    npyv_lanetype_s8 *dst1  = (npyv_lanetype_s8 *) args[2];
    npyv_lanetype_s8 *dst2  = (npyv_lanetype_s8 *) args[3];
    const npyv_s8 vscalar    = npyv_setall_s8(scalar);
    const vsx4_s32x4 divisor = vsx4_divisor_s8(vscalar);
    const int vstep          = npyv_nlanes_s8;
    const npyv_s8 vzero      = npyv_zero_s8();
    const npyv_s8 vmin       = npyv_setall_s8(NPY_MIN_INT8);
    const npyv_s8 vneg_one   = npyv_setall_s8(-1);
    // Loop invariants derived from the divisor.
    npyv_b8 b_pos     = npyv_cmpgt_s8(vscalar, vzero);
    npyv_b8 b_is_neg1 = npyv_cmpeq_s8(vscalar, vneg_one);
    // Sticky mask of lanes that ever overflowed.
    npyv_b8 seen_overflow = npyv_cvt_b8_s8(npyv_zero_s8());

    // Vector part: consume one full vector of lanes per pass.
    while (len >= vstep) {
        npyv_s8 a   = npyv_load_s8(src1);
        npyv_s8 quo = vsx4_div_scalar_s8(a, divisor);
        npyv_s8 rem = npyv_sub_s8(a, vec_mul(vscalar, quo));

        // (a == NPY_MIN_INT8 && b == -1)
        npyv_b8 a_is_min   = npyv_cmpeq_s8(a, vmin);
        npyv_b8 overflowed = npyv_and_s8(b_is_neg1, a_is_min);
        seen_overflow = npyv_or_s8(overflowed, seen_overflow);

        // Lanes satisfying ((a > 0) == (b > 0) || rem == 0) are left alone;
        // all other lanes get rem += b and quo += -1.
        npyv_b8 a_pos     = npyv_cmpgt_s8(a, vzero);
        npyv_b8 same_sign = npyv_cmpeq_s8(a_pos, b_pos);
        npyv_b8 exact     = npyv_cmpeq_s8(rem, vzero);
        npyv_b8 keep      = npyv_or_s8(same_sign, exact);
        npyv_s8 rem_fixup = npyv_select_s8(keep, vzero, vscalar);
        npyv_s8 quo_fixup = npyv_select_s8(keep, vzero, vneg_one);
        rem = npyv_add_s8(rem, rem_fixup);
        quo = npyv_add_s8(quo, quo_fixup);

        // Overflow: quotient forced to the minimum, remainder to 0.
        quo = npyv_select_s8(overflowed, vmin, quo);
        rem = npyv_select_s8(overflowed, vzero, rem);

        npyv_store_s8(dst1, quo);
        npyv_store_s8(dst2, rem);

        len  -= vstep;
        src1 += vstep;
        dst1 += vstep;
        dst2 += vstep;
    }

    if (!vec_all_eq(seen_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    // Scalar part: the remaining (len < vstep) elements.
    while (len > 0) {
        const npyv_lanetype_s8 a = *src1;
        if (NPY_UNLIKELY(a == NPY_MIN_INT8 && scalar == -1)) {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT8;
            *dst2 = 0;
        }
        else {
            *dst1 = a / scalar;
            *dst2 = a % scalar;
            if ((a > 0) != (scalar > 0) && *dst2 != 0) {
                *dst1 -= 1;
                *dst2 += scalar;
            }
        }
        --len;
        ++src1;
        ++dst1;
        ++dst2;
    }
    npyv_cleanup();
}


#line 291
#line 295
/*
 * C-style remainder (no sign fix-up) of two contiguous int16 arrays:
 * args[0][i] % args[1][i] -> args[2][i], `len` elements.
 *
 * Whole vectors go through vsx4_mod_s16; whatever is left (fewer than one
 * vector) is handled by a scalar loop.  The divide-by-zero status is raised
 * whenever a zero divisor is encountered.  In the scalar loop a zero
 * divisor, or the (NPY_MIN_INT16, -1) pair, stores 0.
 */
static NPY_INLINE void
vsx4_simd_fmod_contig_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2];
    const npyv_s16 vzero    = npyv_zero_s16();
    const int vstep         = npyv_nlanes_s16;

    // Vector part: consume one full vector of lanes per pass.
    while (len >= vstep) {
        npyv_s16 a   = npyv_load_s16(src1);
        npyv_s16 b   = npyv_load_s16(src2);
        npyv_s16 rem = vsx4_mod_s16(a, b);

        npyv_store_s16(dst1, rem);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }

        len  -= vstep;
        src1 += vstep;
        src2 += vstep;
        dst1 += vstep;
    }

    // Scalar part: the remaining (len < vstep) elements.
    while (len > 0) {
        const npyv_lanetype_s16 a = *src1;
        const npyv_lanetype_s16 b = *src2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT16, NPY_TRUE)) {
            FLAG_IF_DIVIDEBYZERO(b);
            *dst1 = 0;
        }
        else {
            *dst1 = a % b;
        }
        --len;
        ++src1;
        ++src2;
        ++dst1;
    }
    npyv_cleanup();
}

/*
 * C-style remainder (no sign fix-up) of a contiguous int16 array by one
 * int16 divisor: args[0][i] % *args[1] -> args[2][i], `len` elements.
 *
 * NOTE(review): nothing in this function guards against a zero divisor and
 * no float status is raised here -- presumably the caller only dispatches
 * to it with a non-zero scalar; confirm at the call site.
 */
static NPY_INLINE void
vsx4_simd_fmod_by_scalar_contig_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *src1  = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];
    npyv_lanetype_s16 *dst1  = (npyv_lanetype_s16 *) args[2];
    const npyv_s16 vscalar   = npyv_setall_s16(scalar);
    // Divisor pre-processed once for all vector iterations.
    const npyv_s32x2 divisor = vsx4_divisor_s16(vscalar);
    const int vstep          = npyv_nlanes_s16;

    // Vector part: consume one full vector of lanes per pass.
    while (len >= vstep) {
        npyv_s16 a   = npyv_load_s16(src1);
        npyv_s16 rem = vsx4_mod_scalar_s16(a, divisor);
        npyv_store_s16(dst1, rem);

        len  -= vstep;
        src1 += vstep;
        dst1 += vstep;
    }

    // Scalar part: the remaining (len < vstep) elements.
    while (len > 0) {
        const npyv_lanetype_s16 a = *src1;
        *dst1 = a % scalar;
        --len;
        ++src1;
        ++dst1;
    }
    npyv_cleanup();
}

#line 295
/*
 * Remainder of two contiguous int16 arrays, adjusted the way Python does
 * for mixed-sign operands: args[0][i] % args[1][i] -> args[2][i], `len`
 * elements.
 *
 * Whole vectors go through vsx4_mod_s16 followed by a sign fix-up; whatever
 * is left (fewer than one vector) is handled by a scalar loop.  The
 * divide-by-zero status is raised whenever a zero divisor is encountered.
 * In the scalar loop a zero divisor, or the (NPY_MIN_INT16, -1) pair,
 * stores 0.
 */
static NPY_INLINE void
vsx4_simd_remainder_contig_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2];
    const npyv_s16 vzero    = npyv_zero_s16();
    const int vstep         = npyv_nlanes_s16;

    // Vector part: consume one full vector of lanes per pass.
    while (len >= vstep) {
        npyv_s16 a   = npyv_load_s16(src1);
        npyv_s16 b   = npyv_load_s16(src2);
        npyv_s16 rem = vsx4_mod_s16(a, b);

        // Lanes satisfying ((a > 0) == (b > 0) || rem == 0) keep the
        // remainder as is; all other lanes get the divisor added.
        npyv_b16 a_pos     = npyv_cmpgt_s16(a, vzero);
        npyv_b16 b_pos     = npyv_cmpgt_s16(b, vzero);
        npyv_b16 same_sign = npyv_cmpeq_s16(a_pos, b_pos);
        npyv_b16 exact     = npyv_cmpeq_s16(rem, vzero);
        npyv_b16 keep      = npyv_or_s16(same_sign, exact);
        npyv_s16 fixup     = npyv_select_s16(keep, vzero, b);
        rem = npyv_add_s16(rem, fixup);

        npyv_store_s16(dst1, rem);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }

        len  -= vstep;
        src1 += vstep;
        src2 += vstep;
        dst1 += vstep;
    }

    // Scalar part: the remaining (len < vstep) elements.
    while (len > 0) {
        const npyv_lanetype_s16 a = *src1;
        const npyv_lanetype_s16 b = *src2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT16, NPY_TRUE)) {
            FLAG_IF_DIVIDEBYZERO(b);
            *dst1 = 0;
        }
        else {
            npyv_lanetype_s16 r = a % b;
            if ((a > 0) != (b > 0) && r != 0) {
                r += b;
            }
            *dst1 = r;
        }
        --len;
        ++src1;
        ++src2;
        ++dst1;
    }
    npyv_cleanup();
}

/*
 * Remainder of a contiguous int16 array by one int16 divisor, adjusted the
 * way Python does for mixed-sign operands:
 * args[0][i] % *args[1] -> args[2][i], `len` elements.
 *
 * NOTE(review): nothing in this function guards against a zero divisor and
 * no float status is raised here -- presumably the caller only dispatches
 * to it with a non-zero scalar; confirm at the call site.
 */
static NPY_INLINE void
vsx4_simd_remainder_by_scalar_contig_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *src1  = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];
    npyv_lanetype_s16 *dst1  = (npyv_lanetype_s16 *) args[2];
    const npyv_s16 vscalar   = npyv_setall_s16(scalar);
    const npyv_s32x2 divisor = vsx4_divisor_s16(vscalar);
    const int vstep          = npyv_nlanes_s16;
    const npyv_s16 vzero     = npyv_zero_s16();
    // Loop invariant: lanes set where the divisor is positive.
    npyv_b16 b_pos           = npyv_cmpgt_s16(vscalar, vzero);

    // Vector part: consume one full vector of lanes per pass.
    while (len >= vstep) {
        npyv_s16 a   = npyv_load_s16(src1);
        npyv_s16 rem = vsx4_mod_scalar_s16(a, divisor);

        // Lanes satisfying ((a > 0) == (b > 0) || rem == 0) keep the
        // remainder as is; all other lanes get the divisor added.
        npyv_b16 a_pos     = npyv_cmpgt_s16(a, vzero);
        npyv_b16 same_sign = npyv_cmpeq_s16(a_pos, b_pos);
        npyv_b16 exact     = npyv_cmpeq_s16(rem, vzero);
        npyv_b16 keep      = npyv_or_s16(same_sign, exact);
        npyv_s16 fixup     = npyv_select_s16(keep, vzero, vscalar);
        rem = npyv_add_s16(rem, fixup);

        npyv_store_s16(dst1, rem);

        len  -= vstep;
        src1 += vstep;
        dst1 += vstep;
    }

    // Scalar part: the remaining (len < vstep) elements.
    while (len > 0) {
        const npyv_lanetype_s16 a = *src1;
        npyv_lanetype_s16 r = a % scalar;
        if ((a > 0) != (scalar > 0) && r != 0) {
            r += scalar;
        }
        *dst1 = r;
        --len;
        ++src1;
        ++dst1;
    }
    npyv_cleanup();
}

#line 295
/*
 * divmod kernel for int16 operands.
 *
 * Reads dividends from args[0] and divisors from args[1]; writes quotients
 * to args[2] and remainders to args[3]. Every buffer is advanced one element
 * at a time, and `len` counts elements.
 *
 * The sign handling follows Python: when the operands are on different
 * sides of zero and the division is not exact, the quotient is lowered by
 * one and the divisor is added to the remainder.
 *  - divisor == 0: both outputs are 0, divide-by-zero status is raised.
 *  - NPY_MIN_INT16 with divisor -1: quotient NPY_MIN_INT16, remainder 0,
 *    overflow status is raised.
 */
static NPY_INLINE void
vsx4_simd_divmod_contig_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];
    npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2];
    npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3];
    const int vstep         = npyv_nlanes_s16;
    const npyv_s16 vzero    = npyv_zero_s16();
    const npyv_s16 vneg_one = npyv_setall_s16(-1);
    const npyv_s16 vmin     = npyv_setall_s16(NPY_MIN_INT16);
    // Accumulated across iterations; inspected once after the vector loop.
    npyv_b16 seen_zero      = npyv_cvt_b16_s16(npyv_zero_s16());
    npyv_b16 seen_overflow  = npyv_cvt_b16_s16(npyv_zero_s16());

    // Vector part: one full register of elements per iteration.
    while (len >= vstep) {
        npyv_s16 num = npyv_load_s16(src1);
        npyv_s16 den = npyv_load_s16(src2);

        // Raw quotient, and the remainder rebuilt from it: num - den * quo.
        npyv_s16 vquo = vsx4_div_s16(num, den);
        npyv_s16 vrem = npyv_sub_s16(num, vec_mul(den, vquo));

        // Special lanes: den == 0, or (num == NPY_MIN_INT16 && den == -1).
        npyv_b16 den_is_zero = npyv_cmpeq_s16(den, vzero);
        npyv_b16 num_is_min  = npyv_cmpeq_s16(num, vmin);
        npyv_b16 den_is_m1   = npyv_cmpeq_s16(den, vneg_one);
        npyv_b16 is_overflow = npyv_and_s16(den_is_m1, num_is_min);
        seen_zero     = npyv_or_s16(den_is_zero, seen_zero);
        seen_overflow = npyv_or_s16(is_overflow, seen_overflow);

        // Lanes where ((num > 0) == (den > 0) || rem == 0) stay untouched;
        // the others get the Python-style correction.
        npyv_b16 num_pos   = npyv_cmpgt_s16(num, vzero);
        npyv_b16 den_pos   = npyv_cmpgt_s16(den, vzero);
        npyv_b16 same_side = npyv_cmpeq_s16(num_pos, den_pos);
        npyv_b16 exact     = npyv_cmpeq_s16(vrem, vzero);
        npyv_b16 keep      = npyv_or_s16(same_side, exact);

        // Remainder: add den where corrected, then apply the overrides.
        vrem = npyv_add_s16(vrem, npyv_select_s16(keep, vzero, den));
        vrem = npyv_select_s16(den_is_zero, vzero, vrem);
        vrem = npyv_select_s16(is_overflow, vzero, vrem);

        // Quotient: add -1 where corrected, then apply the overrides.
        vquo = npyv_add_s16(vquo, npyv_select_s16(keep, vzero, vneg_one));
        vquo = npyv_select_s16(den_is_zero, vzero, vquo);
        vquo = npyv_select_s16(is_overflow, vmin, vquo);

        npyv_store_s16(dst1, vquo);
        npyv_store_s16(dst2, vrem);

        len  -= vstep;
        src1 += vstep;
        src2 += vstep;
        dst1 += vstep;
        dst2 += vstep;
    }

    // Raise each status at most once for the whole vector part.
    if (!vec_all_eq(seen_zero, vzero)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(seen_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    // Scalar part: the elements that did not fill a whole register.
    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
        const npyv_lanetype_s16 num = *src1;
        const npyv_lanetype_s16 den = *src2;
        if (!DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_INT16, NPY_TRUE)) {
            *dst1 = num / den;
            *dst2 = num % den;
            if ((num > 0) != (den > 0) && *dst2 != 0) {
                *dst1 -= 1;
                *dst2 += den;
            }
            continue;
        }
        if (den == 0) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        }
        else {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT16;
        }
        *dst2 = 0;
    }
    npyv_cleanup();
}

/*
 * divmod kernel for int16 where the divisor is a single value.
 *
 * Reads dividends from args[0] and one divisor from args[1]; writes
 * quotients to args[2] and remainders to args[3]. `len` counts elements.
 *
 * Sign handling follows Python (quotient lowered by one and divisor added
 * to the remainder when the operands differ in sign and the division is
 * not exact). NPY_MIN_INT16 divided by -1 yields quotient NPY_MIN_INT16,
 * remainder 0 and raises the overflow status.
 *
 * NOTE(review): nothing in this function tests for a zero divisor;
 * presumably the caller only dispatches here for a non-zero scalar - confirm.
 */
static NPY_INLINE void
vsx4_simd_divmod_by_scalar_contig_s16(char **args, npy_intp len)
{
    npyv_lanetype_s16 *src1  = (npyv_lanetype_s16 *) args[0];
    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];
    npyv_lanetype_s16 *dst1  = (npyv_lanetype_s16 *) args[2];
    npyv_lanetype_s16 *dst2  = (npyv_lanetype_s16 *) args[3];
    const int vstep          = npyv_nlanes_s16;
    const npyv_s16 vscalar   = npyv_setall_s16(scalar);
    const npyv_s32x2 divisor = vsx4_divisor_s16(vscalar);
    const npyv_s16 vzero     = npyv_zero_s16();
    const npyv_s16 vmin      = npyv_setall_s16(NPY_MIN_INT16);
    const npyv_s16 vneg_one  = npyv_setall_s16(-1);
    // Facts about the divisor that do not change between iterations.
    npyv_b16 den_pos         = npyv_cmpgt_s16(vscalar, vzero);
    npyv_b16 den_is_m1       = npyv_cmpeq_s16(vscalar, vneg_one);
    // Accumulated across iterations; inspected once after the vector loop.
    npyv_b16 seen_overflow   = npyv_cvt_b16_s16(npyv_zero_s16());

    // Vector part: one full register of elements per iteration.
    while (len >= vstep) {
        npyv_s16 num = npyv_load_s16(src1);

        // Raw quotient, and the remainder rebuilt from it.
        npyv_s16 vquo = vsx4_div_scalar_s16(num, divisor);
        npyv_s16 vrem = npyv_sub_s16(num, vec_mul(vscalar, vquo));

        // Overflow lanes: (num == NPY_MIN_INT16 && scalar == -1).
        npyv_b16 num_is_min  = npyv_cmpeq_s16(num, vmin);
        npyv_b16 is_overflow = npyv_and_s16(den_is_m1, num_is_min);
        seen_overflow = npyv_or_s16(is_overflow, seen_overflow);

        // Lanes where ((num > 0) == (scalar > 0) || rem == 0) stay
        // untouched; the others get the Python-style correction.
        npyv_b16 num_pos   = npyv_cmpgt_s16(num, vzero);
        npyv_b16 same_side = npyv_cmpeq_s16(num_pos, den_pos);
        npyv_b16 exact     = npyv_cmpeq_s16(vrem, vzero);
        npyv_b16 keep      = npyv_or_s16(same_side, exact);

        vrem = npyv_add_s16(vrem, npyv_select_s16(keep, vzero, vscalar));
        vquo = npyv_add_s16(vquo, npyv_select_s16(keep, vzero, vneg_one));

        // Overflow lanes: quotient becomes the minimum, remainder 0.
        vquo = npyv_select_s16(is_overflow, vmin, vquo);
        vrem = npyv_select_s16(is_overflow, vzero, vrem);

        npyv_store_s16(dst1, vquo);
        npyv_store_s16(dst2, vrem);

        len  -= vstep;
        src1 += vstep;
        dst1 += vstep;
        dst2 += vstep;
    }

    if (!vec_all_eq(seen_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    // Scalar part: the elements that did not fill a whole register.
    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
        const npyv_lanetype_s16 num = *src1;
        if (NPY_UNLIKELY(num == NPY_MIN_INT16 && scalar == -1)) {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT16;
            *dst2 = 0;
            continue;
        }
        *dst1 = num / scalar;
        *dst2 = num % scalar;
        if ((num > 0) != (scalar > 0) && *dst2 != 0) {
            *dst1 -= 1;
            *dst2 += scalar;
        }
    }
    npyv_cleanup();
}


#line 291
#line 295
/*
 * fmod kernel for int32 operands.
 *
 * Reads dividends from args[0] and divisors from args[1]; writes the
 * remainders to args[2]. `len` counts elements. No sign correction is
 * applied to the remainder.
 *
 * The divide-by-zero status is raised whenever a divisor equals 0.
 *
 * NOTE(review): in the vector loop, lanes with a zero divisor are stored
 * exactly as vsx4_mod_s32 produced them, while the scalar tail stores 0 for
 * a zero divisor and for NPY_MIN_INT32 with -1. Whether the two paths agree
 * depends on vsx4_mod_s32, which is defined elsewhere - confirm.
 */
static NPY_INLINE void
vsx4_simd_fmod_contig_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *src2 = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2];
    const int vstep         = npyv_nlanes_s32;
    const npyv_s32 vzero    = npyv_zero_s32();

    // Vector part: one full register of elements per iteration.
    while (len >= vstep) {
        npyv_s32 num = npyv_load_s32(src1);
        npyv_s32 den = npyv_load_s32(src2);

        npyv_store_s32(dst1, vsx4_mod_s32(num, den));
        if (NPY_UNLIKELY(vec_any_eq(den, vzero))) {
            npy_set_floatstatus_divbyzero();
        }

        len  -= vstep;
        src1 += vstep;
        src2 += vstep;
        dst1 += vstep;
    }

    // Scalar part: the elements that did not fill a whole register.
    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_s32 num = *src1;
        const npyv_lanetype_s32 den = *src2;
        if (!DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_INT32, NPY_TRUE)) {
            *dst1 = num % den;
            continue;
        }
        // den == 0, or NPY_MIN_INT32 with -1: never hand these to `%`.
        FLAG_IF_DIVIDEBYZERO(den);
        *dst1 = 0;
    }
    npyv_cleanup();
}

/*
 * fmod kernel for int32 where the divisor is a single value.
 *
 * Reads dividends from args[0] and one divisor from args[1]; writes the
 * remainders to args[2]. `len` counts elements. No sign correction is
 * applied to the remainder.
 *
 * A divisor of -1 is answered up front with zeros: x % -1 is 0 for every x,
 * and the one operand pair C leaves undefined (NPY_MIN_INT32 % -1) must not
 * reach the `%` operator in the scalar tail. Storing 0 matches what the
 * scalar tail of vsx4_simd_fmod_contig_s32 stores for the same operands.
 *
 * NOTE(review): nothing in this function tests for a zero divisor;
 * presumably the caller only dispatches here for a non-zero scalar - confirm.
 */
static NPY_INLINE void
vsx4_simd_fmod_by_scalar_contig_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *src1  = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1];
    npyv_lanetype_s32 *dst1  = (npyv_lanetype_s32 *) args[2];

    // Every remainder modulo -1 is 0; this also keeps NPY_MIN_INT32 % -1
    // (undefined behaviour in C) away from both loops below.
    if (scalar == -1) {
        for (; len > 0; --len, ++dst1) {
            *dst1 = 0;
        }
        npyv_cleanup();
        return;
    }

    const npyv_s32 vscalar = npyv_setall_s32(scalar);
    const npyv_s32 divisor = vsx4_divisor_s32(vscalar);
    const int vstep        = npyv_nlanes_s32;

    // Vector part: one full register of elements per iteration.
    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
        npyv_s32 a   = npyv_load_s32(src1);
        npyv_s32 rem = vsx4_mod_scalar_s32(a, divisor);
        npyv_store_s32(dst1, rem);
    }

    // Scalar part: the elements that did not fill a whole register.
    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_s32 a = *src1;
        *dst1 = a % scalar;
    }
    npyv_cleanup();
}

#line 295
/*
 * remainder kernel for int32 operands.
 *
 * Reads dividends from args[0] and divisors from args[1]; writes the
 * remainders to args[2]. `len` counts elements.
 *
 * Sign handling follows Python: when the operands are on different sides of
 * zero and the remainder is non-zero, the divisor is added to it.
 * The divide-by-zero status is raised whenever a divisor equals 0.
 *
 * NOTE(review): in the vector loop, lanes with a zero divisor keep whatever
 * vsx4_mod_s32 produced (the correction adds 0 there), while the scalar
 * tail stores 0 for a zero divisor and for NPY_MIN_INT32 with -1. Whether
 * the two paths agree depends on vsx4_mod_s32, defined elsewhere - confirm.
 */
static NPY_INLINE void
vsx4_simd_remainder_contig_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *src2 = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2];
    const int vstep         = npyv_nlanes_s32;
    const npyv_s32 vzero    = npyv_zero_s32();

    // Vector part: one full register of elements per iteration.
    while (len >= vstep) {
        npyv_s32 num  = npyv_load_s32(src1);
        npyv_s32 den  = npyv_load_s32(src2);
        npyv_s32 vrem = vsx4_mod_s32(num, den);

        // Lanes where ((num > 0) == (den > 0) || rem == 0) stay untouched;
        // the others get den added, the way Python does it.
        npyv_b32 num_pos   = npyv_cmpgt_s32(num, vzero);
        npyv_b32 den_pos   = npyv_cmpgt_s32(den, vzero);
        npyv_b32 same_side = npyv_cmpeq_s32(num_pos, den_pos);
        npyv_b32 exact     = npyv_cmpeq_s32(vrem, vzero);
        npyv_b32 keep      = npyv_or_s32(same_side, exact);
        vrem = npyv_add_s32(vrem, npyv_select_s32(keep, vzero, den));

        npyv_store_s32(dst1, vrem);
        if (NPY_UNLIKELY(vec_any_eq(den, vzero))) {
            npy_set_floatstatus_divbyzero();
        }

        len  -= vstep;
        src1 += vstep;
        src2 += vstep;
        dst1 += vstep;
    }

    // Scalar part: the elements that did not fill a whole register.
    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_s32 num = *src1;
        const npyv_lanetype_s32 den = *src2;
        if (!DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_INT32, NPY_TRUE)) {
            *dst1 = num % den;
            if ((num > 0) != (den > 0) && *dst1 != 0) {
                *dst1 += den;
            }
            continue;
        }
        // den == 0, or NPY_MIN_INT32 with -1: never hand these to `%`.
        FLAG_IF_DIVIDEBYZERO(den);
        *dst1 = 0;
    }
    npyv_cleanup();
}

/*
 * remainder kernel for int32 where the divisor is a single value.
 *
 * Reads dividends from args[0] and one divisor from args[1]; writes the
 * remainders to args[2]. `len` counts elements.
 *
 * Sign handling follows Python: when the operands are on different sides of
 * zero and the remainder is non-zero, the divisor is added to it.
 *
 * A divisor of -1 is answered up front with zeros: x % -1 is 0 for every x
 * (so no sign correction applies either), and the one operand pair C leaves
 * undefined (NPY_MIN_INT32 % -1) must not reach the `%` operator in the
 * scalar tail. Storing 0 matches what the scalar tail of
 * vsx4_simd_remainder_contig_s32 stores for the same operands.
 *
 * NOTE(review): nothing in this function tests for a zero divisor;
 * presumably the caller only dispatches here for a non-zero scalar - confirm.
 */
static NPY_INLINE void
vsx4_simd_remainder_by_scalar_contig_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *src1  = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1];
    npyv_lanetype_s32 *dst1  = (npyv_lanetype_s32 *) args[2];

    // Every remainder modulo -1 is 0; this also keeps NPY_MIN_INT32 % -1
    // (undefined behaviour in C) away from both loops below.
    if (scalar == -1) {
        for (; len > 0; --len, ++dst1) {
            *dst1 = 0;
        }
        npyv_cleanup();
        return;
    }

    const npyv_s32 vscalar = npyv_setall_s32(scalar);
    const npyv_s32 divisor = vsx4_divisor_s32(vscalar);
    const int vstep        = npyv_nlanes_s32;
    const npyv_s32 vzero   = npyv_zero_s32();
    // Sign of the divisor is the same for every element.
    npyv_b32 b_gt_zero     = npyv_cmpgt_s32(vscalar, vzero);

    // Vector part: one full register of elements per iteration.
    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
        npyv_s32 a   = npyv_load_s32(src1);
        npyv_s32 rem = vsx4_mod_scalar_s32(a, divisor);

        // handle mixed case the way Python does:
        // lanes where ((a > 0) == (b > 0) || rem == 0) stay untouched,
        // the others get the divisor added.
        npyv_b32 a_gt_zero  = npyv_cmpgt_s32(a, vzero);
        npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero);
        npyv_b32 rem_zero   = npyv_cmpeq_s32(rem, vzero);
        npyv_b32 keep       = npyv_or_s32(ab_eq_cond, rem_zero);
        npyv_s32 to_add     = npyv_select_s32(keep, vzero, vscalar);
        rem = npyv_add_s32(rem, to_add);

        npyv_store_s32(dst1, rem);
    }

    // Scalar part: the elements that did not fill a whole register.
    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_s32 a = *src1;
        *dst1 = a % scalar;
        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
            *dst1 += scalar;
        }
    }
    npyv_cleanup();
}

#line 295
/*
 * divmod kernel for int32 operands.
 *
 * Reads dividends from args[0] and divisors from args[1]; writes quotients
 * to args[2] and remainders to args[3]. Every buffer is advanced one element
 * at a time, and `len` counts elements.
 *
 * The sign handling follows Python: when the operands are on different
 * sides of zero and the division is not exact, the quotient is lowered by
 * one and the divisor is added to the remainder.
 *  - divisor == 0: both outputs are 0, divide-by-zero status is raised.
 *  - NPY_MIN_INT32 with divisor -1: quotient NPY_MIN_INT32, remainder 0,
 *    overflow status is raised.
 */
static NPY_INLINE void
vsx4_simd_divmod_contig_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 *src2 = (npyv_lanetype_s32 *) args[1];
    npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2];
    npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3];
    const int vstep         = npyv_nlanes_s32;
    const npyv_s32 vzero    = npyv_zero_s32();
    const npyv_s32 vneg_one = npyv_setall_s32(-1);
    const npyv_s32 vmin     = npyv_setall_s32(NPY_MIN_INT32);
    // Accumulated across iterations; inspected once after the vector loop.
    npyv_b32 seen_zero      = npyv_cvt_b32_s32(npyv_zero_s32());
    npyv_b32 seen_overflow  = npyv_cvt_b32_s32(npyv_zero_s32());

    // Vector part: one full register of elements per iteration.
    while (len >= vstep) {
        npyv_s32 num = npyv_load_s32(src1);
        npyv_s32 den = npyv_load_s32(src2);

        // Raw quotient, and the remainder rebuilt from it: num - den * quo.
        npyv_s32 vquo = vsx4_div_s32(num, den);
        npyv_s32 vrem = npyv_sub_s32(num, vec_mul(den, vquo));

        // Special lanes: den == 0, or (num == NPY_MIN_INT32 && den == -1).
        npyv_b32 den_is_zero = npyv_cmpeq_s32(den, vzero);
        npyv_b32 num_is_min  = npyv_cmpeq_s32(num, vmin);
        npyv_b32 den_is_m1   = npyv_cmpeq_s32(den, vneg_one);
        npyv_b32 is_overflow = npyv_and_s32(den_is_m1, num_is_min);
        seen_zero     = npyv_or_s32(den_is_zero, seen_zero);
        seen_overflow = npyv_or_s32(is_overflow, seen_overflow);

        // Lanes where ((num > 0) == (den > 0) || rem == 0) stay untouched;
        // the others get the Python-style correction.
        npyv_b32 num_pos   = npyv_cmpgt_s32(num, vzero);
        npyv_b32 den_pos   = npyv_cmpgt_s32(den, vzero);
        npyv_b32 same_side = npyv_cmpeq_s32(num_pos, den_pos);
        npyv_b32 exact     = npyv_cmpeq_s32(vrem, vzero);
        npyv_b32 keep      = npyv_or_s32(same_side, exact);

        // Remainder: add den where corrected, then apply the overrides.
        vrem = npyv_add_s32(vrem, npyv_select_s32(keep, vzero, den));
        vrem = npyv_select_s32(den_is_zero, vzero, vrem);
        vrem = npyv_select_s32(is_overflow, vzero, vrem);

        // Quotient: add -1 where corrected, then apply the overrides.
        vquo = npyv_add_s32(vquo, npyv_select_s32(keep, vzero, vneg_one));
        vquo = npyv_select_s32(den_is_zero, vzero, vquo);
        vquo = npyv_select_s32(is_overflow, vmin, vquo);

        npyv_store_s32(dst1, vquo);
        npyv_store_s32(dst2, vrem);

        len  -= vstep;
        src1 += vstep;
        src2 += vstep;
        dst1 += vstep;
        dst2 += vstep;
    }

    // Raise each status at most once for the whole vector part.
    if (!vec_all_eq(seen_zero, vzero)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(seen_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    // Scalar part: the elements that did not fill a whole register.
    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
        const npyv_lanetype_s32 num = *src1;
        const npyv_lanetype_s32 den = *src2;
        if (!DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_INT32, NPY_TRUE)) {
            *dst1 = num / den;
            *dst2 = num % den;
            if ((num > 0) != (den > 0) && *dst2 != 0) {
                *dst1 -= 1;
                *dst2 += den;
            }
            continue;
        }
        if (den == 0) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        }
        else {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT32;
        }
        *dst2 = 0;
    }
    npyv_cleanup();
}

/*
 * divmod kernel for int32 where the divisor is a single value.
 *
 * Reads dividends from args[0] and one divisor from args[1]; writes
 * quotients to args[2] and remainders to args[3]. `len` counts elements.
 *
 * Sign handling follows Python (quotient lowered by one and divisor added
 * to the remainder when the operands differ in sign and the division is
 * not exact). NPY_MIN_INT32 divided by -1 yields quotient NPY_MIN_INT32,
 * remainder 0 and raises the overflow status.
 *
 * NOTE(review): nothing in this function tests for a zero divisor;
 * presumably the caller only dispatches here for a non-zero scalar - confirm.
 */
static NPY_INLINE void
vsx4_simd_divmod_by_scalar_contig_s32(char **args, npy_intp len)
{
    npyv_lanetype_s32 *src1  = (npyv_lanetype_s32 *) args[0];
    npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1];
    npyv_lanetype_s32 *dst1  = (npyv_lanetype_s32 *) args[2];
    npyv_lanetype_s32 *dst2  = (npyv_lanetype_s32 *) args[3];
    const int vstep          = npyv_nlanes_s32;
    const npyv_s32 vscalar   = npyv_setall_s32(scalar);
    const npyv_s32 divisor   = vsx4_divisor_s32(vscalar);
    const npyv_s32 vzero     = npyv_zero_s32();
    const npyv_s32 vmin      = npyv_setall_s32(NPY_MIN_INT32);
    const npyv_s32 vneg_one  = npyv_setall_s32(-1);
    // Facts about the divisor that do not change between iterations.
    npyv_b32 den_pos         = npyv_cmpgt_s32(vscalar, vzero);
    npyv_b32 den_is_m1       = npyv_cmpeq_s32(vscalar, vneg_one);
    // Accumulated across iterations; inspected once after the vector loop.
    npyv_b32 seen_overflow   = npyv_cvt_b32_s32(npyv_zero_s32());

    // Vector part: one full register of elements per iteration.
    while (len >= vstep) {
        npyv_s32 num = npyv_load_s32(src1);

        // Raw quotient, and the remainder rebuilt from it.
        npyv_s32 vquo = vsx4_div_scalar_s32(num, divisor);
        npyv_s32 vrem = npyv_sub_s32(num, vec_mul(vscalar, vquo));

        // Overflow lanes: (num == NPY_MIN_INT32 && scalar == -1).
        npyv_b32 num_is_min  = npyv_cmpeq_s32(num, vmin);
        npyv_b32 is_overflow = npyv_and_s32(den_is_m1, num_is_min);
        seen_overflow = npyv_or_s32(is_overflow, seen_overflow);

        // Lanes where ((num > 0) == (scalar > 0) || rem == 0) stay
        // untouched; the others get the Python-style correction.
        npyv_b32 num_pos   = npyv_cmpgt_s32(num, vzero);
        npyv_b32 same_side = npyv_cmpeq_s32(num_pos, den_pos);
        npyv_b32 exact     = npyv_cmpeq_s32(vrem, vzero);
        npyv_b32 keep      = npyv_or_s32(same_side, exact);

        vrem = npyv_add_s32(vrem, npyv_select_s32(keep, vzero, vscalar));
        vquo = npyv_add_s32(vquo, npyv_select_s32(keep, vzero, vneg_one));

        // Overflow lanes: quotient becomes the minimum, remainder 0.
        vquo = npyv_select_s32(is_overflow, vmin, vquo);
        vrem = npyv_select_s32(is_overflow, vzero, vrem);

        npyv_store_s32(dst1, vquo);
        npyv_store_s32(dst2, vrem);

        len  -= vstep;
        src1 += vstep;
        dst1 += vstep;
        dst2 += vstep;
    }

    if (!vec_all_eq(seen_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    // Scalar part: the elements that did not fill a whole register.
    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
        const npyv_lanetype_s32 num = *src1;
        if (NPY_UNLIKELY(num == NPY_MIN_INT32 && scalar == -1)) {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT32;
            *dst2 = 0;
            continue;
        }
        *dst1 = num / scalar;
        *dst2 = num % scalar;
        if ((num > 0) != (scalar > 0) && *dst2 != 0) {
            *dst1 -= 1;
            *dst2 += scalar;
        }
    }
    npyv_cleanup();
}


#line 291
#line 295
/*
 * fmod kernel for two contiguous int64 arrays:
 *     dst1[i] = src1[i] % src2[i]      (C remainder, truncated division)
 *
 * args[0] / args[1] are read as the dividend / divisor arrays, args[2] is
 * written as the output array and `len` is the number of elements.
 *
 * Special cases, handled identically by the vector body and the scalar tail:
 *   - divisor == 0               -> result 0, divide-by-zero status raised
 *   - NPY_MIN_INT64 % -1         -> result 0, no status raised
 */
static NPY_INLINE void
vsx4_simd_fmod_contig_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *src1 = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *src2 = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_s64 *dst1 = (npyv_lanetype_s64 *) args[2];
    const npyv_s64 vzero    = npyv_zero_s64();
    const npyv_s64 vneg_one = npyv_setall_s64(-1);
    const npyv_s64 vmin     = npyv_setall_s64(NPY_MIN_INT64);
    const int vstep         = npyv_nlanes_s64;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_s64 a   = npyv_load_s64(src1);
        npyv_s64 b   = npyv_load_s64(src2);
        npyv_s64 rem = vsx4_mod_s64(a, b);
        // (b == 0 || (a == NPY_MIN_INT64 && b == -1))
        npyv_b64 bzero    = npyv_cmpeq_s64(b, vzero);
        npyv_b64 amin     = npyv_cmpeq_s64(a, vmin);
        npyv_b64 bneg_one = npyv_cmpeq_s64(b, vneg_one);
        npyv_b64 overflow = npyv_and_s64(bneg_one, amin);
        // Do not trust what the vector modulo produced for these lanes:
        // force them to 0, exactly as the scalar tail below and the divmod
        // kernel do, so the output does not depend on where in the array
        // an element happens to sit.
        rem = npyv_select_s64(bzero, vzero, rem);
        rem = npyv_select_s64(overflow, vzero, rem);
        npyv_store_s64(dst1, rem);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }
    }

    // scalar tail for the elements that do not fill a whole vector
    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_s64 a = *src1;
        const npyv_lanetype_s64 b = *src2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT64, NPY_TRUE)) {
            // only flags when b == 0; the MIN % -1 case is silent
            FLAG_IF_DIVIDEBYZERO(b);
            *dst1 = 0;
        }
        else {
            *dst1 = a % b;
        }
    }
    npyv_cleanup();
}

/*
 * fmod kernel for a contiguous int64 array and a single int64 divisor:
 *     dst1[i] = src1[i] % scalar       (C remainder, truncated division)
 *
 * args[0] is read as the dividend array, *args[1] as the divisor and
 * args[2] is written as the output array; `len` is the element count.
 *
 * Precondition: the divisor must be non-zero, nothing in here checks for it.
 * (The UBYTE/USHORT dispatchers in this file test that before calling their
 * by-scalar kernels; presumably the int64 dispatcher does the same.)
 *
 * NPY_MIN_INT64 % -1 yields 0 without raising any status, matching the
 * array-array fmod kernel.
 */
static NPY_INLINE void
vsx4_simd_fmod_by_scalar_contig_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *src1  = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 scalar = *(npyv_lanetype_s64 *) args[1];
    npyv_lanetype_s64 *dst1  = (npyv_lanetype_s64 *) args[2];
    const npyv_s64 vscalar   = npyv_setall_s64(scalar);
    // divisor in whatever form vsx4_mod_scalar_s64 expects
    const npyv_s64 divisor   = vsx4_divisor_s64(vscalar);
    const npyv_s64 vzero     = npyv_zero_s64();
    const npyv_s64 vmin      = npyv_setall_s64(NPY_MIN_INT64);
    const npyv_s64 vneg_one  = npyv_setall_s64(-1);
    // loop invariant: all lanes set when scalar == -1
    npyv_b64 bneg_one        = npyv_cmpeq_s64(vscalar, vneg_one);
    const int vstep          = npyv_nlanes_s64;

    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
        npyv_s64 a   = npyv_load_s64(src1);
        npyv_s64 rem = vsx4_mod_scalar_s64(a, divisor);
        // (a == NPY_MIN_INT64 && b == -1)
        npyv_b64 amin     = npyv_cmpeq_s64(a, vmin);
        npyv_b64 overflow = npyv_and_s64(bneg_one, amin);
        // Overflow: do not trust the vector modulo, the result is 0
        rem = npyv_select_s64(overflow, vzero, rem);
        npyv_store_s64(dst1, rem);
    }

    // scalar tail for the elements that do not fill a whole vector
    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_s64 a = *src1;
        if (NPY_UNLIKELY(a == NPY_MIN_INT64 && scalar == -1)) {
            // `a % scalar` would be undefined behaviour in C here
            *dst1 = 0;
        }
        else {
            *dst1 = a % scalar;
        }
    }
    npyv_cleanup();
}

#line 295
/*
 * remainder kernel for two contiguous int64 arrays.
 *
 * Computes the C remainder of src1[i] / src2[i] and then, when the operands
 * have different signs and the remainder is non-zero, adds the divisor to it
 * ("the way Python does"), so a non-zero result follows the divisor's sign.
 *
 * args[0] / args[1] are read as the dividend / divisor arrays, args[2] is
 * written as the output array and `len` is the number of elements.
 *
 * Special cases, handled identically by the vector body and the scalar tail:
 *   - divisor == 0               -> result 0, divide-by-zero status raised
 *   - NPY_MIN_INT64 % -1         -> result 0, no status raised
 */
static NPY_INLINE void
vsx4_simd_remainder_contig_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *src1 = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *src2 = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_s64 *dst1 = (npyv_lanetype_s64 *) args[2];
    const npyv_s64 vzero    = npyv_zero_s64();
    const npyv_s64 vneg_one = npyv_setall_s64(-1);
    const npyv_s64 vmin     = npyv_setall_s64(NPY_MIN_INT64);
    const int vstep         = npyv_nlanes_s64;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_s64 a   = npyv_load_s64(src1);
        npyv_s64 b   = npyv_load_s64(src2);
        npyv_s64 rem = vsx4_mod_s64(a, b);
        // (b == 0 || (a == NPY_MIN_INT64 && b == -1))
        npyv_b64 bzero    = npyv_cmpeq_s64(b, vzero);
        npyv_b64 amin     = npyv_cmpeq_s64(a, vmin);
        npyv_b64 bneg_one = npyv_cmpeq_s64(b, vneg_one);
        npyv_b64 overflow = npyv_and_s64(bneg_one, amin);
        // handle mixed case the way Python does
        // ((a > 0) == (b > 0) || rem == 0)
        npyv_b64 a_gt_zero  = npyv_cmpgt_s64(a, vzero);
        npyv_b64 b_gt_zero  = npyv_cmpgt_s64(b, vzero);
        npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero);
        npyv_b64 rem_zero   = npyv_cmpeq_s64(rem, vzero);
        // lanes whose remainder is already final (named `keep_rem` rather
        // than `or`, which turns into a macro once <iso646.h> is included)
        npyv_b64 keep_rem   = npyv_or_s64(ab_eq_cond, rem_zero);
        npyv_s64 to_add     = npyv_select_s64(keep_rem, vzero, b);
        rem = npyv_add_s64(rem, to_add);
        // Do not trust what the vector modulo produced for the special
        // lanes: force them to 0, exactly as the scalar tail below and the
        // divmod kernel do.
        rem = npyv_select_s64(bzero, vzero, rem);
        rem = npyv_select_s64(overflow, vzero, rem);
        npyv_store_s64(dst1, rem);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }
    }

    // scalar tail for the elements that do not fill a whole vector
    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_s64 a = *src1;
        const npyv_lanetype_s64 b = *src2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT64, NPY_TRUE)) {
            // only flags when b == 0; the MIN % -1 case is silent
            FLAG_IF_DIVIDEBYZERO(b);
            *dst1 = 0;
        }
        else {
            *dst1 = a % b;
            if (!((a > 0) == (b > 0) || *dst1 == 0)) {
                *dst1 += b;
            }
        }
    }
    npyv_cleanup();
}

/*
 * remainder kernel for a contiguous int64 array and a single int64 divisor.
 *
 * Computes the C remainder of src1[i] / scalar and then, when the operands
 * have different signs and the remainder is non-zero, adds the divisor to it
 * ("the way Python does"), so a non-zero result follows the divisor's sign.
 *
 * args[0] is read as the dividend array, *args[1] as the divisor and
 * args[2] is written as the output array; `len` is the element count.
 *
 * Precondition: the divisor must be non-zero, nothing in here checks for it.
 * (The UBYTE/USHORT dispatchers in this file test that before calling their
 * by-scalar kernels; presumably the int64 dispatcher does the same.)
 *
 * NPY_MIN_INT64 % -1 yields 0 without raising any status, matching the
 * array-array remainder kernel.
 */
static NPY_INLINE void
vsx4_simd_remainder_by_scalar_contig_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *src1  = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 scalar = *(npyv_lanetype_s64 *) args[1];
    npyv_lanetype_s64 *dst1  = (npyv_lanetype_s64 *) args[2];
    const npyv_s64 vscalar   = npyv_setall_s64(scalar);
    // divisor in whatever form vsx4_mod_scalar_s64 expects
    const npyv_s64 divisor   = vsx4_divisor_s64(vscalar);
    const npyv_s64 vzero     = npyv_zero_s64();
    const npyv_s64 vmin      = npyv_setall_s64(NPY_MIN_INT64);
    const npyv_s64 vneg_one  = npyv_setall_s64(-1);
    // loop invariants derived from the divisor
    npyv_b64 b_gt_zero       = npyv_cmpgt_s64(vscalar, vzero);
    npyv_b64 bneg_one        = npyv_cmpeq_s64(vscalar, vneg_one);
    const int vstep          = npyv_nlanes_s64;

    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
        npyv_s64 a   = npyv_load_s64(src1);
        npyv_s64 rem = vsx4_mod_scalar_s64(a, divisor);
        // (a == NPY_MIN_INT64 && b == -1)
        npyv_b64 amin     = npyv_cmpeq_s64(a, vmin);
        npyv_b64 overflow = npyv_and_s64(bneg_one, amin);
        // handle mixed case the way Python does
        // ((a > 0) == (b > 0) || rem == 0)
        npyv_b64 a_gt_zero  = npyv_cmpgt_s64(a, vzero);
        npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero);
        npyv_b64 rem_zero   = npyv_cmpeq_s64(rem, vzero);
        // lanes whose remainder is already final (named `keep_rem` rather
        // than `or`, which turns into a macro once <iso646.h> is included)
        npyv_b64 keep_rem   = npyv_or_s64(ab_eq_cond, rem_zero);
        npyv_s64 to_add     = npyv_select_s64(keep_rem, vzero, vscalar);
        rem = npyv_add_s64(rem, to_add);
        // Overflow: do not trust the vector modulo, the result is 0
        rem = npyv_select_s64(overflow, vzero, rem);
        npyv_store_s64(dst1, rem);
    }

    // scalar tail for the elements that do not fill a whole vector
    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_s64 a = *src1;
        if (NPY_UNLIKELY(a == NPY_MIN_INT64 && scalar == -1)) {
            // `a % scalar` would be undefined behaviour in C here
            *dst1 = 0;
        }
        else {
            *dst1 = a % scalar;
            if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
                *dst1 += scalar;
            }
        }
    }
    npyv_cleanup();
}

#line 295
/*
 * divmod kernel for two contiguous int64 arrays.
 *
 * args[0] / args[1] are read as the dividend / divisor arrays; the quotient
 * goes to args[2] and the remainder to args[3]. `len` is the element count.
 *
 * The truncated quotient/remainder pair is adjusted "the way Python does":
 * when the operands differ in sign and the remainder is non-zero, the
 * quotient is decremented and the divisor is added to the remainder.
 *
 * Special cases:
 *   - divisor == 0          -> (0, 0), divide-by-zero status raised
 *   - NPY_MIN_INT64 / -1    -> (NPY_MIN_INT64, 0), overflow status raised
 */
static NPY_INLINE void
vsx4_simd_divmod_contig_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *num_ptr = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 *den_ptr = (npyv_lanetype_s64 *) args[1];
    npyv_lanetype_s64 *quo_ptr = (npyv_lanetype_s64 *) args[2];
    npyv_lanetype_s64 *rem_ptr = (npyv_lanetype_s64 *) args[3];
    const int lanes            = npyv_nlanes_s64;
    const npyv_s64 zeros       = npyv_zero_s64();
    const npyv_s64 minus_ones  = npyv_setall_s64(-1);
    const npyv_s64 int_mins    = npyv_setall_s64(NPY_MIN_INT64);
    // per-lane "seen at least once" accumulators, inspected after the loop
    npyv_b64 seen_zero_div     = npyv_cvt_b64_s64(npyv_zero_s64());
    npyv_b64 seen_overflow     = npyv_cvt_b64_s64(npyv_zero_s64());

    while (len >= lanes) {
        npyv_s64 num = npyv_load_s64(num_ptr);
        npyv_s64 den = npyv_load_s64(den_ptr);
        npyv_s64 quo = vsx4_div_s64(num, den);
        npyv_s64 rem = npyv_sub_s64(num, vec_mul(den, quo));

        /* lanes with den == 0, and lanes with num == MIN && den == -1 */
        npyv_b64 den_is_zero = npyv_cmpeq_s64(den, zeros);
        npyv_b64 num_is_min  = npyv_cmpeq_s64(num, int_mins);
        npyv_b64 den_is_m1   = npyv_cmpeq_s64(den, minus_ones);
        npyv_b64 overflowed  = npyv_and_s64(den_is_m1, num_is_min);
        seen_zero_div = npyv_or_s64(den_is_zero, seen_zero_div);
        seen_overflow = npyv_or_s64(overflowed, seen_overflow);

        /* lanes needing no fix-up: (num > 0) == (den > 0) || rem == 0 */
        npyv_b64 num_pos   = npyv_cmpgt_s64(num, zeros);
        npyv_b64 den_pos   = npyv_cmpgt_s64(den, zeros);
        npyv_b64 same_sign = npyv_cmpeq_s64(num_pos, den_pos);
        npyv_b64 exact     = npyv_cmpeq_s64(rem, zeros);
        npyv_b64 settled   = npyv_or_s64(same_sign, exact);

        /* remaining lanes: rem += den, quo += -1 */
        npyv_s64 rem_fix = npyv_select_s64(settled, zeros, den);
        rem = npyv_add_s64(rem, rem_fix);
        npyv_s64 quo_fix = npyv_select_s64(settled, zeros, minus_ones);
        quo = npyv_add_s64(quo, quo_fix);

        /* divide by zero -> (0, 0) */
        quo = npyv_select_s64(den_is_zero, zeros, quo);
        rem = npyv_select_s64(den_is_zero, zeros, rem);
        /* overflow -> (MIN, 0) */
        quo = npyv_select_s64(overflowed, int_mins, quo);
        rem = npyv_select_s64(overflowed, zeros, rem);

        npyv_store_s64(quo_ptr, quo);
        npyv_store_s64(rem_ptr, rem);

        len     -= lanes;
        num_ptr += lanes;
        den_ptr += lanes;
        quo_ptr += lanes;
        rem_ptr += lanes;
    }

    /* raise each status at most once for the whole vector part */
    if (!vec_all_eq(seen_zero_div, zeros)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(seen_overflow, zeros)) {
        npy_set_floatstatus_overflow();
    }

    /* leftover elements, one at a time */
    while (len > 0) {
        const npyv_lanetype_s64 num = *num_ptr;
        const npyv_lanetype_s64 den = *den_ptr;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_INT64, NPY_TRUE)) {
            if (den != 0) {
                npy_set_floatstatus_overflow();
                *quo_ptr = NPY_MIN_INT64;
                *rem_ptr = 0;
            }
            else {
                npy_set_floatstatus_divbyzero();
                *quo_ptr = 0;
                *rem_ptr = 0;
            }
        }
        else {
            const int signs_match = (num > 0) == (den > 0);
            *quo_ptr = num / den;
            *rem_ptr = num % den;
            if (!signs_match && *rem_ptr != 0) {
                *quo_ptr -= 1;
                *rem_ptr += den;
            }
        }
        --len;
        ++num_ptr;
        ++den_ptr;
        ++quo_ptr;
        ++rem_ptr;
    }
    npyv_cleanup();
}

/*
 * divmod kernel for a contiguous int64 array and a single int64 divisor.
 *
 * args[0] is read as the dividend array and *args[1] as the divisor; the
 * quotient goes to args[2] and the remainder to args[3]. `len` is the
 * element count.
 *
 * The truncated quotient/remainder pair is adjusted "the way Python does":
 * when the operands differ in sign and the remainder is non-zero, the
 * quotient is decremented and the divisor is added to the remainder.
 *
 * NPY_MIN_INT64 / -1 produces (NPY_MIN_INT64, 0) and raises the overflow
 * status. A zero divisor is not checked for in here.
 */
static NPY_INLINE void
vsx4_simd_divmod_by_scalar_contig_s64(char **args, npy_intp len)
{
    npyv_lanetype_s64 *num_ptr = (npyv_lanetype_s64 *) args[0];
    npyv_lanetype_s64 scalar   = *(npyv_lanetype_s64 *) args[1];
    npyv_lanetype_s64 *quo_ptr = (npyv_lanetype_s64 *) args[2];
    npyv_lanetype_s64 *rem_ptr = (npyv_lanetype_s64 *) args[3];
    const int lanes            = npyv_nlanes_s64;
    const npyv_s64 vscalar     = npyv_setall_s64(scalar);
    const npyv_s64 divisor     = vsx4_divisor_s64(vscalar);
    const npyv_s64 zeros       = npyv_zero_s64();
    const npyv_s64 int_mins    = npyv_setall_s64(NPY_MIN_INT64);
    const npyv_s64 minus_ones  = npyv_setall_s64(-1);
    /* loop invariants derived from the divisor */
    npyv_b64 den_pos           = npyv_cmpgt_s64(vscalar, zeros);
    npyv_b64 den_is_m1         = npyv_cmpeq_s64(vscalar, minus_ones);
    /* per-lane overflow accumulator, inspected after the loop */
    npyv_b64 seen_overflow     = npyv_cvt_b64_s64(npyv_zero_s64());

    while (len >= lanes) {
        npyv_s64 num = npyv_load_s64(num_ptr);
        npyv_s64 quo = vsx4_div_scalar_s64(num, divisor);
        npyv_s64 rem = npyv_sub_s64(num, vec_mul(vscalar, quo));

        /* lanes with num == MIN while the divisor is -1 */
        npyv_b64 num_is_min = npyv_cmpeq_s64(num, int_mins);
        npyv_b64 overflowed = npyv_and_s64(den_is_m1, num_is_min);
        seen_overflow = npyv_or_s64(overflowed, seen_overflow);

        /* lanes needing no fix-up: (num > 0) == (den > 0) || rem == 0 */
        npyv_b64 num_pos   = npyv_cmpgt_s64(num, zeros);
        npyv_b64 same_sign = npyv_cmpeq_s64(num_pos, den_pos);
        npyv_b64 exact     = npyv_cmpeq_s64(rem, zeros);
        npyv_b64 settled   = npyv_or_s64(same_sign, exact);

        /* remaining lanes: rem += scalar, quo += -1 */
        npyv_s64 rem_fix = npyv_select_s64(settled, zeros, vscalar);
        rem = npyv_add_s64(rem, rem_fix);
        npyv_s64 quo_fix = npyv_select_s64(settled, zeros, minus_ones);
        quo = npyv_add_s64(quo, quo_fix);

        /* overflow -> (MIN, 0) */
        quo = npyv_select_s64(overflowed, int_mins, quo);
        rem = npyv_select_s64(overflowed, zeros, rem);

        npyv_store_s64(quo_ptr, quo);
        npyv_store_s64(rem_ptr, rem);

        len     -= lanes;
        num_ptr += lanes;
        quo_ptr += lanes;
        rem_ptr += lanes;
    }

    /* raise the status at most once for the whole vector part */
    if (!vec_all_eq(seen_overflow, zeros)) {
        npy_set_floatstatus_overflow();
    }

    /* leftover elements, one at a time */
    while (len > 0) {
        const npyv_lanetype_s64 num = *num_ptr;
        if (NPY_UNLIKELY(num == NPY_MIN_INT64 && scalar == -1)) {
            npy_set_floatstatus_overflow();
            *quo_ptr = NPY_MIN_INT64;
            *rem_ptr = 0;
        }
        else {
            const int signs_match = (num > 0) == (scalar > 0);
            *quo_ptr = num / scalar;
            *rem_ptr = num % scalar;
            if (!signs_match && *rem_ptr != 0) {
                *quo_ptr -= 1;
                *rem_ptr += scalar;
            }
        }
        --len;
        ++num_ptr;
        ++quo_ptr;
        ++rem_ptr;
    }
    npyv_cleanup();
}


#endif // NPY_SIMD && defined(NPY_HAVE_VSX4)

/*****************************************************************************
 ** Defining ufunc inner functions
 *****************************************************************************/

#line 524
/*
 * Pick the SIMD kernel suffix used by the npy_ubyte loops that follow.
 *
 * TO_SIMD_SFX(X) pastes a lane-type suffix onto X. The width is chosen by
 * matching NPY_BITSOF_BYTE against 8/16/32/64; every inner `#if 0` disables
 * the signed (_sN) spelling, so the unsigned (_uN) one is selected
 * (presumably the expanded "signed" switch of the template).
 *
 * If no width matches, TO_SIMD_SFX stays undefined; the loops below test
 * defined(TO_SIMD_SFX) and then compile only their scalar path.
 */
#undef TO_SIMD_SFX
#if 0
#line 529
#elif NPY_BITSOF_BYTE == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 529
#elif NPY_BITSOF_BYTE == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 529
#elif NPY_BITSOF_BYTE == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 529
#elif NPY_BITSOF_BYTE == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

/*
 * fmod inner loop for npy_ubyte.
 *
 * When built with VSX4 SIMD support and a usable TO_SIMD_SFX, the whole call
 * is handed to a SIMD kernel if the operand layout qualifies:
 *   - IS_BLOCKABLE_BINARY: array % array
 *   - IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: array % scalar
 * Everything else goes through the generic strided loop, which writes 0 and
 * raises the divide-by-zero status for a zero divisor.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* array % array */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    /* array % scalar; the kernel is only used for a non-zero divisor */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) &&
            (*(npy_ubyte *)args[1]) != 0) {
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_ubyte dividend = *(npy_ubyte *)ip1;
        const npy_ubyte divisor  = *(npy_ubyte *)ip2;
        npy_ubyte *out           = (npy_ubyte *)op1;
        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *out = dividend % divisor;
        }
        else {
            FLAG_IF_DIVIDEBYZERO(divisor);
            *out = 0;
        }
    }
}

/*
 * remainder inner loop for npy_ubyte.
 *
 * When built with VSX4 SIMD support and a usable TO_SIMD_SFX, the whole call
 * is handed to a SIMD kernel if the operand layout qualifies:
 *   - IS_BLOCKABLE_BINARY: array % array
 *   - IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: array % scalar
 * Everything else goes through the generic strided loop, which writes 0 and
 * raises the divide-by-zero status for a zero divisor. The type is unsigned,
 * so the scalar path is a plain `%` with no sign fix-up.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* array % array */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    /* array % scalar; the kernel is only used for a non-zero divisor */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) &&
            (*(npy_ubyte *)args[1]) != 0) {
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_ubyte dividend = *(npy_ubyte *)ip1;
        const npy_ubyte divisor  = *(npy_ubyte *)ip2;
        npy_ubyte *out           = (npy_ubyte *)op1;
        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *out = dividend % divisor;
        }
        else {
            FLAG_IF_DIVIDEBYZERO(divisor);
            *out = 0;
        }
    }
}

/*
 * divmod inner loop for npy_ubyte: quotient to the first output, remainder
 * to the second.
 *
 * When built with VSX4 SIMD support and a usable TO_SIMD_SFX, the whole call
 * is handed to a SIMD kernel if the operand layout qualifies:
 *   - IS_BLOCKABLE_BINARY: array / array
 *   - IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: array / scalar
 * Everything else goes through the generic strided loop, which writes (0, 0)
 * and raises the divide-by-zero status for a zero divisor.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* array / array */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    /* array / scalar; the kernel is only used for a non-zero divisor */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) &&
            (*(npy_ubyte *)args[1]) != 0) {
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP_TWO_OUT {
        const npy_ubyte dividend = *(npy_ubyte *)ip1;
        const npy_ubyte divisor  = *(npy_ubyte *)ip2;
        npy_ubyte *quo_out       = (npy_ubyte *)op1;
        npy_ubyte *rem_out       = (npy_ubyte *)op2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            npy_set_floatstatus_divbyzero();
            *quo_out = 0;
            *rem_out = 0;
        }
        else {
            *quo_out = dividend / divisor;
            *rem_out = dividend % divisor;
        }
    }
}

#line 524
/*
 * Pick the SIMD kernel suffix used by the npy_ushort loops that follow.
 *
 * TO_SIMD_SFX(X) pastes a lane-type suffix onto X. The width is chosen by
 * matching NPY_BITSOF_SHORT against 8/16/32/64; every inner `#if 0` disables
 * the signed (_sN) spelling, so the unsigned (_uN) one is selected
 * (presumably the expanded "signed" switch of the template).
 *
 * If no width matches, TO_SIMD_SFX stays undefined; the loops below test
 * defined(TO_SIMD_SFX) and then compile only their scalar path.
 */
#undef TO_SIMD_SFX
#if 0
#line 529
#elif NPY_BITSOF_SHORT == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 529
#elif NPY_BITSOF_SHORT == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 529
#elif NPY_BITSOF_SHORT == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 529
#elif NPY_BITSOF_SHORT == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

/*
 * fmod inner loop for npy_ushort.
 *
 * When built with VSX4 SIMD support and a usable TO_SIMD_SFX, the whole call
 * is handed to a SIMD kernel if the operand layout qualifies:
 *   - IS_BLOCKABLE_BINARY: array % array
 *   - IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: array % scalar
 * Everything else goes through the generic strided loop, which writes 0 and
 * raises the divide-by-zero status for a zero divisor.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* array % array */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    /* array % scalar; the kernel is only used for a non-zero divisor */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) &&
            (*(npy_ushort *)args[1]) != 0) {
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_ushort dividend = *(npy_ushort *)ip1;
        const npy_ushort divisor  = *(npy_ushort *)ip2;
        npy_ushort *out           = (npy_ushort *)op1;
        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *out = dividend % divisor;
        }
        else {
            FLAG_IF_DIVIDEBYZERO(divisor);
            *out = 0;
        }
    }
}

/*
 * remainder inner loop for npy_ushort.
 *
 * When built with VSX4 SIMD support and a usable TO_SIMD_SFX, the whole call
 * is handed to a SIMD kernel if the operand layout qualifies:
 *   - IS_BLOCKABLE_BINARY: array % array
 *   - IS_BLOCKABLE_BINARY_SCALAR2 with a non-zero divisor: array % scalar
 * Everything else goes through the generic strided loop, which writes 0 and
 * raises the divide-by-zero status for a zero divisor. The type is unsigned,
 * so the scalar path is a plain `%` with no sign fix-up.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* array % array */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    /* array % scalar; the kernel is only used for a non-zero divisor */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) &&
            (*(npy_ushort *)args[1]) != 0) {
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_ushort dividend = *(npy_ushort *)ip1;
        const npy_ushort divisor  = *(npy_ushort *)ip2;
        npy_ushort *out           = (npy_ushort *)op1;
        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *out = dividend % divisor;
        }
        else {
            FLAG_IF_DIVIDEBYZERO(divisor);
            *out = 0;
        }
    }
}

/*
 * Element-wise divmod loop for npy_ushort: the quotient goes to the first
 * output (op1) and the remainder to the second (op2).
 *
 * A zero divisor stores 0 in both outputs and raises the divide-by-zero
 * floating-point status flag.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_divmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH)
            && *(npy_ushort *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP_TWO_OUT {
        const npy_ushort dividend = *(npy_ushort *)ip1;
        const npy_ushort divisor = *(npy_ushort *)ip2;
        npy_ushort *quo_dst = (npy_ushort *)op1;
        npy_ushort *rem_dst = (npy_ushort *)op2;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *quo_dst = dividend / divisor;
            *rem_dst = dividend % divisor;
        }
        else {
            /* Division by zero: flag it and zero both results. */
            npy_set_floatstatus_divbyzero();
            *quo_dst = 0;
            *rem_dst = 0;
        }
    }
}

/*
 * Select the SIMD name suffix for the unsigned int (UINT_*) loops that
 * follow: TO_SIMD_SFX(X) pastes _u8/_u16/_u32/_u64 onto X according to
 * NPY_BITSOF_INT. The inner `#if 0` arms (the signed _sN suffixes) are
 * compiled out for this instantiation. If NPY_BITSOF_INT matches none of the
 * listed widths, TO_SIMD_SFX stays undefined and code guarded by
 * defined(TO_SIMD_SFX) is skipped.
 */
#line 524
#undef TO_SIMD_SFX
#if 0
#line 529
#elif NPY_BITSOF_INT == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 529
#elif NPY_BITSOF_INT == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 529
#elif NPY_BITSOF_INT == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 529
#elif NPY_BITSOF_INT == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

/*
 * Element-wise fmod loop for npy_uint: each output is `a % b`.
 *
 * A zero divisor stores 0 and raises the divide-by-zero floating-point
 * status flag.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_fmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH)
            && *(npy_uint *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_uint dividend = *(npy_uint *)ip1;
        const npy_uint divisor = *(npy_uint *)ip2;
        npy_uint *dst = (npy_uint *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *dst = dividend % divisor;
        }
        else {
            /* Division by zero: flag it and store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Element-wise remainder loop for npy_uint.
 *
 * The Python-style sign adjustment only exists in the signed instantiations
 * of this template, so each output here is plain `a % b`. A zero divisor
 * stores 0 and raises the divide-by-zero floating-point status flag.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_remainder_*
 * kernels instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH)
            && *(npy_uint *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_uint dividend = *(npy_uint *)ip1;
        const npy_uint divisor = *(npy_uint *)ip2;
        npy_uint *dst = (npy_uint *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *dst = dividend % divisor;
        }
        else {
            /* Division by zero: flag it and store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Element-wise divmod loop for npy_uint: the quotient goes to the first
 * output (op1) and the remainder to the second (op2).
 *
 * A zero divisor stores 0 in both outputs and raises the divide-by-zero
 * floating-point status flag.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_divmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH)
            && *(npy_uint *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP_TWO_OUT {
        const npy_uint dividend = *(npy_uint *)ip1;
        const npy_uint divisor = *(npy_uint *)ip2;
        npy_uint *quo_dst = (npy_uint *)op1;
        npy_uint *rem_dst = (npy_uint *)op2;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *quo_dst = dividend / divisor;
            *rem_dst = dividend % divisor;
        }
        else {
            /* Division by zero: flag it and zero both results. */
            npy_set_floatstatus_divbyzero();
            *quo_dst = 0;
            *rem_dst = 0;
        }
    }
}

/*
 * Select the SIMD name suffix for the unsigned long (ULONG_*) loops that
 * follow: TO_SIMD_SFX(X) pastes _u8/_u16/_u32/_u64 onto X according to
 * NPY_BITSOF_LONG. The inner `#if 0` arms (the signed _sN suffixes) are
 * compiled out for this instantiation. If NPY_BITSOF_LONG matches none of
 * the listed widths, TO_SIMD_SFX stays undefined and code guarded by
 * defined(TO_SIMD_SFX) is skipped.
 */
#line 524
#undef TO_SIMD_SFX
#if 0
#line 529
#elif NPY_BITSOF_LONG == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 529
#elif NPY_BITSOF_LONG == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 529
#elif NPY_BITSOF_LONG == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 529
#elif NPY_BITSOF_LONG == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

/*
 * Element-wise fmod loop for npy_ulong: each output is `a % b`.
 *
 * A zero divisor stores 0 and raises the divide-by-zero floating-point
 * status flag.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_fmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH)
            && *(npy_ulong *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_ulong dividend = *(npy_ulong *)ip1;
        const npy_ulong divisor = *(npy_ulong *)ip2;
        npy_ulong *dst = (npy_ulong *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *dst = dividend % divisor;
        }
        else {
            /* Division by zero: flag it and store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Element-wise remainder loop for npy_ulong.
 *
 * The Python-style sign adjustment only exists in the signed instantiations
 * of this template, so each output here is plain `a % b`. A zero divisor
 * stores 0 and raises the divide-by-zero floating-point status flag.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_remainder_*
 * kernels instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH)
            && *(npy_ulong *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_ulong dividend = *(npy_ulong *)ip1;
        const npy_ulong divisor = *(npy_ulong *)ip2;
        npy_ulong *dst = (npy_ulong *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *dst = dividend % divisor;
        }
        else {
            /* Division by zero: flag it and store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Element-wise divmod loop for npy_ulong: the quotient goes to the first
 * output (op1) and the remainder to the second (op2).
 *
 * A zero divisor stores 0 in both outputs and raises the divide-by-zero
 * floating-point status flag.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_divmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH)
            && *(npy_ulong *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP_TWO_OUT {
        const npy_ulong dividend = *(npy_ulong *)ip1;
        const npy_ulong divisor = *(npy_ulong *)ip2;
        npy_ulong *quo_dst = (npy_ulong *)op1;
        npy_ulong *rem_dst = (npy_ulong *)op2;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *quo_dst = dividend / divisor;
            *rem_dst = dividend % divisor;
        }
        else {
            /* Division by zero: flag it and zero both results. */
            npy_set_floatstatus_divbyzero();
            *quo_dst = 0;
            *rem_dst = 0;
        }
    }
}

/*
 * Select the SIMD name suffix for the unsigned long long (ULONGLONG_*) loops
 * that follow: TO_SIMD_SFX(X) pastes _u8/_u16/_u32/_u64 onto X according to
 * NPY_BITSOF_LONGLONG. The inner `#if 0` arms (the signed _sN suffixes) are
 * compiled out for this instantiation. If NPY_BITSOF_LONGLONG matches none
 * of the listed widths, TO_SIMD_SFX stays undefined and code guarded by
 * defined(TO_SIMD_SFX) is skipped.
 */
#line 524
#undef TO_SIMD_SFX
#if 0
#line 529
#elif NPY_BITSOF_LONGLONG == 8
    #if 0
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 529
#elif NPY_BITSOF_LONGLONG == 16
    #if 0
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 529
#elif NPY_BITSOF_LONGLONG == 32
    #if 0
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 529
#elif NPY_BITSOF_LONGLONG == 64
    #if 0
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

/*
 * Element-wise fmod loop for npy_ulonglong: each output is `a % b`.
 *
 * A zero divisor stores 0 and raises the divide-by-zero floating-point
 * status flag.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_fmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)
            && *(npy_ulonglong *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_ulonglong dividend = *(npy_ulonglong *)ip1;
        const npy_ulonglong divisor = *(npy_ulonglong *)ip2;
        npy_ulonglong *dst = (npy_ulonglong *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *dst = dividend % divisor;
        }
        else {
            /* Division by zero: flag it and store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Element-wise remainder loop for npy_ulonglong.
 *
 * The Python-style sign adjustment only exists in the signed instantiations
 * of this template, so each output here is plain `a % b`. A zero divisor
 * stores 0 and raises the divide-by-zero floating-point status flag.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_remainder_*
 * kernels instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)
            && *(npy_ulonglong *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_ulonglong dividend = *(npy_ulonglong *)ip1;
        const npy_ulonglong divisor = *(npy_ulonglong *)ip2;
        npy_ulonglong *dst = (npy_ulonglong *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *dst = dividend % divisor;
        }
        else {
            /* Division by zero: flag it and store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Element-wise divmod loop for npy_ulonglong: the quotient goes to the first
 * output (op1) and the remainder to the second (op2).
 *
 * A zero divisor stores 0 in both outputs and raises the divide-by-zero
 * floating-point status flag.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_divmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)
            && *(npy_ulonglong *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP_TWO_OUT {
        const npy_ulonglong dividend = *(npy_ulonglong *)ip1;
        const npy_ulonglong divisor = *(npy_ulonglong *)ip2;
        npy_ulonglong *quo_dst = (npy_ulonglong *)op1;
        npy_ulonglong *rem_dst = (npy_ulonglong *)op2;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
            *quo_dst = dividend / divisor;
            *rem_dst = dividend % divisor;
        }
        else {
            /* Division by zero: flag it and zero both results. */
            npy_set_floatstatus_divbyzero();
            *quo_dst = 0;
            *rem_dst = 0;
        }
    }
}

/*
 * Select the SIMD name suffix for the signed byte (BYTE_*) loops that
 * follow: TO_SIMD_SFX(X) pastes _s8/_s16/_s32/_s64 onto X according to
 * NPY_BITSOF_BYTE. The inner `#if 1` arms pick the signed suffixes; the
 * unsigned _uN alternatives are compiled out for this instantiation. If
 * NPY_BITSOF_BYTE matches none of the listed widths, TO_SIMD_SFX stays
 * undefined and code guarded by defined(TO_SIMD_SFX) is skipped.
 */
#line 524
#undef TO_SIMD_SFX
#if 0
#line 529
#elif NPY_BITSOF_BYTE == 8
    #if 1
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 529
#elif NPY_BITSOF_BYTE == 16
    #if 1
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 529
#elif NPY_BITSOF_BYTE == 32
    #if 1
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 529
#elif NPY_BITSOF_BYTE == 64
    #if 1
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

/*
 * Element-wise fmod loop for npy_byte: each output is the C `a % b`.
 *
 * Two inputs are special-cased and store 0: a zero divisor (which also
 * raises the divide-by-zero floating-point status flag) and the
 * NPY_MIN_BYTE / -1 pair (no flag is raised for that one).
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_fmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH)
            && *(npy_byte *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_byte dividend = *(npy_byte *)ip1;
        const npy_byte divisor = *(npy_byte *)ip2;
        npy_byte *dst = (npy_byte *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_BYTE, NPY_TRUE)) {
            *dst = dividend % divisor;
        }
        else {
            /* Only the zero-divisor case raises the flag; both store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Element-wise remainder loop for npy_byte with Python semantics: a non-zero
 * remainder whose operands differ in sign is shifted by the divisor.
 *
 * Two inputs are special-cased and store 0: a zero divisor (which also
 * raises the divide-by-zero floating-point status flag) and the
 * NPY_MIN_BYTE / -1 pair (no flag is raised for that one).
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_remainder_*
 * kernels instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH)
            && *(npy_byte *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_byte dividend = *(npy_byte *)ip1;
        const npy_byte divisor = *(npy_byte *)ip2;
        npy_byte *dst = (npy_byte *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_BYTE, NPY_TRUE)) {
            /* handle mixed case the way Python does */
            const npy_byte rem = dividend % divisor;
            const int needs_shift =
                ((dividend > 0) != (divisor > 0)) && rem != 0;
            *dst = needs_shift ? (rem + divisor) : rem;
        }
        else {
            /* Only the zero-divisor case raises the flag; both store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Element-wise divmod loop for npy_byte with Python semantics: the quotient
 * goes to the first output (op1) and the remainder to the second (op2).
 * When the operands differ in sign and the remainder is non-zero, the
 * quotient is decremented and the remainder shifted by the divisor.
 *
 * Special cases:
 *   - zero divisor: both outputs are 0 and the divide-by-zero
 *     floating-point status flag is raised;
 *   - NPY_MIN_BYTE / -1: quotient NPY_MIN_BYTE, remainder 0, and the
 *     overflow floating-point status flag is raised.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_divmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH)
            && *(npy_byte *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP_TWO_OUT {
        const npy_byte dividend = *(npy_byte *)ip1;
        const npy_byte divisor = *(npy_byte *)ip2;
        npy_byte *quo_dst = (npy_byte *)op1;
        npy_byte *rem_dst = (npy_byte *)op2;

        /* see FIXME note for divide above */
        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_BYTE, NPY_TRUE)) {
            /* handle mixed case the way Python does */
            const npy_byte quo = dividend / divisor;
            const npy_byte rem = dividend % divisor;
            const int needs_fixup =
                ((dividend > 0) != (divisor > 0)) && rem != 0;
            *quo_dst = needs_fixup ? (quo - 1) : quo;
            *rem_dst = needs_fixup ? (rem + divisor) : rem;
        }
        else if (divisor == 0) {
            npy_set_floatstatus_divbyzero();
            *quo_dst = 0;
            *rem_dst = 0;
        }
        else {
            /* Remaining flagged case: NPY_MIN_BYTE divided by -1. */
            npy_set_floatstatus_overflow();
            *quo_dst = NPY_MIN_BYTE;
            *rem_dst = 0;
        }
    }
}

/*
 * Select the SIMD name suffix for the signed short (SHORT_*) loops that
 * follow: TO_SIMD_SFX(X) pastes _s8/_s16/_s32/_s64 onto X according to
 * NPY_BITSOF_SHORT. The inner `#if 1` arms pick the signed suffixes; the
 * unsigned _uN alternatives are compiled out for this instantiation. If
 * NPY_BITSOF_SHORT matches none of the listed widths, TO_SIMD_SFX stays
 * undefined and code guarded by defined(TO_SIMD_SFX) is skipped.
 */
#line 524
#undef TO_SIMD_SFX
#if 0
#line 529
#elif NPY_BITSOF_SHORT == 8
    #if 1
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 529
#elif NPY_BITSOF_SHORT == 16
    #if 1
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 529
#elif NPY_BITSOF_SHORT == 32
    #if 1
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 529
#elif NPY_BITSOF_SHORT == 64
    #if 1
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

/*
 * Element-wise fmod loop for npy_short: each output is the C `a % b`.
 *
 * Two inputs are special-cased and store 0: a zero divisor (which also
 * raises the divide-by-zero floating-point status flag) and the
 * NPY_MIN_SHORT / -1 pair (no flag is raised for that one).
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_fmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH)
            && *(npy_short *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_short dividend = *(npy_short *)ip1;
        const npy_short divisor = *(npy_short *)ip2;
        npy_short *dst = (npy_short *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_SHORT, NPY_TRUE)) {
            *dst = dividend % divisor;
        }
        else {
            /* Only the zero-divisor case raises the flag; both store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Element-wise remainder loop for npy_short with Python semantics: a
 * non-zero remainder whose operands differ in sign is shifted by the divisor.
 *
 * Two inputs are special-cased and store 0: a zero divisor (which also
 * raises the divide-by-zero floating-point status flag) and the
 * NPY_MIN_SHORT / -1 pair (no flag is raised for that one).
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_remainder_*
 * kernels instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH)
            && *(npy_short *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_short dividend = *(npy_short *)ip1;
        const npy_short divisor = *(npy_short *)ip2;
        npy_short *dst = (npy_short *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_SHORT, NPY_TRUE)) {
            /* handle mixed case the way Python does */
            const npy_short rem = dividend % divisor;
            const int needs_shift =
                ((dividend > 0) != (divisor > 0)) && rem != 0;
            *dst = needs_shift ? (rem + divisor) : rem;
        }
        else {
            /* Only the zero-divisor case raises the flag; both store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Element-wise divmod loop for npy_short with Python semantics: the quotient
 * goes to the first output (op1) and the remainder to the second (op2).
 * When the operands differ in sign and the remainder is non-zero, the
 * quotient is decremented and the remainder shifted by the divisor.
 *
 * Special cases:
 *   - zero divisor: both outputs are 0 and the divide-by-zero
 *     floating-point status flag is raised;
 *   - NPY_MIN_SHORT / -1: quotient NPY_MIN_SHORT, remainder 0, and the
 *     overflow floating-point status flag is raised.
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_divmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH)
            && *(npy_short *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP_TWO_OUT {
        const npy_short dividend = *(npy_short *)ip1;
        const npy_short divisor = *(npy_short *)ip2;
        npy_short *quo_dst = (npy_short *)op1;
        npy_short *rem_dst = (npy_short *)op2;

        /* see FIXME note for divide above */
        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_SHORT, NPY_TRUE)) {
            /* handle mixed case the way Python does */
            const npy_short quo = dividend / divisor;
            const npy_short rem = dividend % divisor;
            const int needs_fixup =
                ((dividend > 0) != (divisor > 0)) && rem != 0;
            *quo_dst = needs_fixup ? (quo - 1) : quo;
            *rem_dst = needs_fixup ? (rem + divisor) : rem;
        }
        else if (divisor == 0) {
            npy_set_floatstatus_divbyzero();
            *quo_dst = 0;
            *rem_dst = 0;
        }
        else {
            /* Remaining flagged case: NPY_MIN_SHORT divided by -1. */
            npy_set_floatstatus_overflow();
            *quo_dst = NPY_MIN_SHORT;
            *rem_dst = 0;
        }
    }
}

/*
 * Select the SIMD name suffix for the signed int (INT_*) loops that follow:
 * TO_SIMD_SFX(X) pastes _s8/_s16/_s32/_s64 onto X according to
 * NPY_BITSOF_INT. The inner `#if 1` arms pick the signed suffixes; the
 * unsigned _uN alternatives are compiled out for this instantiation. If
 * NPY_BITSOF_INT matches none of the listed widths, TO_SIMD_SFX stays
 * undefined and code guarded by defined(TO_SIMD_SFX) is skipped.
 */
#line 524
#undef TO_SIMD_SFX
#if 0
#line 529
#elif NPY_BITSOF_INT == 8
    #if 1
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 529
#elif NPY_BITSOF_INT == 16
    #if 1
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 529
#elif NPY_BITSOF_INT == 32
    #if 1
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 529
#elif NPY_BITSOF_INT == 64
    #if 1
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

/*
 * Element-wise fmod loop for npy_int: each output is the C `a % b`.
 *
 * Two inputs are special-cased and store 0: a zero divisor (which also
 * raises the divide-by-zero floating-point status flag) and the
 * NPY_MIN_INT / -1 pair (no flag is raised for that one).
 *
 * When VSX4 SIMD is available and TO_SIMD_SFX is defined, inputs accepted by
 * the IS_BLOCKABLE_* checks are forwarded to the vsx4_simd_fmod_* kernels
 * instead of the scalar loop.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    /* Array-by-array case: both operands are arrays of the same size. */
    if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    /*
     * Array-by-scalar case on a contiguous block; a zero scalar divisor is
     * left to the scalar loop below.
     */
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH)
            && *(npy_int *)args[1] != 0) {
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_int dividend = *(npy_int *)ip1;
        const npy_int divisor = *(npy_int *)ip2;
        npy_int *dst = (npy_int *)op1;

        if (!DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_INT, NPY_TRUE)) {
            *dst = dividend % divisor;
        }
        else {
            /* Only the zero-divisor case raises the flag; both store 0. */
            FLAG_IF_DIVIDEBYZERO(divisor);
            *dst = 0;
        }
    }
}

/*
 * Inner loop computing the Python-style remainder of args[0] by args[1]
 * for npy_int, writing to args[2].
 *
 * The C `%` result is used as-is unless it is non-zero and the operands
 * differ in sign, in which case the divisor is added to it. A zero divisor
 * raises the divide-by-zero floating-point status flag and stores 0.
 * NPY_MIN_INT with a divisor of -1 also stores 0, without raising a flag.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) {
        // dividend and divisor are arrays of the same size
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) &&
            *(npy_int *)args[1] != 0) {
        // contiguous dividend, divisor is a single non-zero scalar
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_int num = *(npy_int *)ip1;
        const npy_int den = *(npy_int *)ip2;
        // stays 0 for both the divide-by-zero and the overflow case
        npy_int res = 0;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_INT, NPY_TRUE)) {
            FLAG_IF_DIVIDEBYZERO(den);
        }
        else {
            res = num % den;
            // mixed-sign operands with a non-zero remainder: match Python
            if (res != 0 && (num > 0) != (den > 0)) {
                res += den;
            }
        }
        *(npy_int *)op1 = res;
    }
}

/*
 * Inner loop computing quotient and remainder of args[0] by args[1] for
 * npy_int; the quotient goes to args[2] and the remainder to args[3].
 *
 * Special cases:
 *   - divisor == 0: raises the divide-by-zero status flag, stores 0 and 0.
 *   - NPY_MIN_INT / -1: raises the overflow status flag, stores
 *     NPY_MIN_INT as quotient and 0 as remainder.
 * Otherwise the C `/` and `%` results are adjusted the way Python does for
 * mixed-sign operands with a non-zero remainder: the quotient is
 * decremented and the divisor is added to the remainder.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) {
        // dividend and divisor are arrays of the same size
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) &&
            *(npy_int *)args[1] != 0) {
        // contiguous dividend, divisor is a single non-zero scalar
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP_TWO_OUT {
        const npy_int num = *(npy_int *)ip1;
        const npy_int den = *(npy_int *)ip2;
        npy_int quo = 0;
        npy_int rem = 0;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_INT, NPY_TRUE)) {
            if (den != 0) {
                // only remaining possibility: NPY_MIN_INT / -1
                npy_set_floatstatus_overflow();
                quo = NPY_MIN_INT;
            }
            else {
                npy_set_floatstatus_divbyzero();
            }
        }
        else {
            quo = num / den;
            rem = num % den;
            // mixed-sign operands with a non-zero remainder: match Python
            if (rem != 0 && (num > 0) != (den > 0)) {
                quo -= 1;
                rem += den;
            }
        }
        *(npy_int *)op1 = quo;
        *(npy_int *)op2 = rem;
    }
}

/*
 * Pick the SIMD name suffix used by the LONG_* loops below.
 *
 * TO_SIMD_SFX(X) token-pastes a lane-type suffix onto X. The suffix is
 * chosen from NPY_BITSOF_LONG; because every inner `#if 1` is true, only
 * the signed variants (_s8/_s16/_s32/_s64) are ever selected and the
 * unsigned `#else` arms are dead. The leading `#if 0` only exists so that
 * each width can be written as an `#elif`. If no width matches, the macro
 * stays undefined and the loops below skip their SIMD paths through their
 * `defined(TO_SIMD_SFX)` guard.
 */
#line 524
#undef TO_SIMD_SFX
#if 0
#line 529
#elif NPY_BITSOF_LONG == 8
    #if 1
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 529
#elif NPY_BITSOF_LONG == 16
    #if 1
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 529
#elif NPY_BITSOF_LONG == 32
    #if 1
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 529
#elif NPY_BITSOF_LONG == 64
    #if 1
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

/*
 * Inner loop computing the C `%` remainder (sign follows the dividend)
 * of args[0] by args[1] for npy_long, writing to args[2].
 *
 * A zero divisor raises the divide-by-zero floating-point status flag and
 * stores 0. NPY_MIN_LONG % -1 also stores 0, without raising a flag.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) {
        // dividend and divisor are arrays of the same size
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) &&
            *(npy_long *)args[1] != 0) {
        // contiguous dividend, divisor is a single non-zero scalar
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_long num = *(npy_long *)ip1;
        const npy_long den = *(npy_long *)ip2;
        // stays 0 for both the divide-by-zero and the overflow case
        npy_long res = 0;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_LONG, NPY_TRUE)) {
            FLAG_IF_DIVIDEBYZERO(den);
        }
        else {
            res = num % den;
        }
        *(npy_long *)op1 = res;
    }
}

/*
 * Inner loop computing the Python-style remainder of args[0] by args[1]
 * for npy_long, writing to args[2].
 *
 * The C `%` result is used as-is unless it is non-zero and the operands
 * differ in sign, in which case the divisor is added to it. A zero divisor
 * raises the divide-by-zero floating-point status flag and stores 0.
 * NPY_MIN_LONG with a divisor of -1 also stores 0, without raising a flag.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) {
        // dividend and divisor are arrays of the same size
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) &&
            *(npy_long *)args[1] != 0) {
        // contiguous dividend, divisor is a single non-zero scalar
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_long num = *(npy_long *)ip1;
        const npy_long den = *(npy_long *)ip2;
        // stays 0 for both the divide-by-zero and the overflow case
        npy_long res = 0;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_LONG, NPY_TRUE)) {
            FLAG_IF_DIVIDEBYZERO(den);
        }
        else {
            res = num % den;
            // mixed-sign operands with a non-zero remainder: match Python
            if (res != 0 && (num > 0) != (den > 0)) {
                res += den;
            }
        }
        *(npy_long *)op1 = res;
    }
}

/*
 * Inner loop computing quotient and remainder of args[0] by args[1] for
 * npy_long; the quotient goes to args[2] and the remainder to args[3].
 *
 * Special cases:
 *   - divisor == 0: raises the divide-by-zero status flag, stores 0 and 0.
 *   - NPY_MIN_LONG / -1: raises the overflow status flag, stores
 *     NPY_MIN_LONG as quotient and 0 as remainder.
 * Otherwise the C `/` and `%` results are adjusted the way Python does for
 * mixed-sign operands with a non-zero remainder: the quotient is
 * decremented and the divisor is added to the remainder.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) {
        // dividend and divisor are arrays of the same size
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) &&
            *(npy_long *)args[1] != 0) {
        // contiguous dividend, divisor is a single non-zero scalar
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP_TWO_OUT {
        const npy_long num = *(npy_long *)ip1;
        const npy_long den = *(npy_long *)ip2;
        npy_long quo = 0;
        npy_long rem = 0;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_LONG, NPY_TRUE)) {
            if (den != 0) {
                // only remaining possibility: NPY_MIN_LONG / -1
                npy_set_floatstatus_overflow();
                quo = NPY_MIN_LONG;
            }
            else {
                npy_set_floatstatus_divbyzero();
            }
        }
        else {
            quo = num / den;
            rem = num % den;
            // mixed-sign operands with a non-zero remainder: match Python
            if (rem != 0 && (num > 0) != (den > 0)) {
                quo -= 1;
                rem += den;
            }
        }
        *(npy_long *)op1 = quo;
        *(npy_long *)op2 = rem;
    }
}

/*
 * Pick the SIMD name suffix used by the LONGLONG_* loops below.
 *
 * TO_SIMD_SFX(X) token-pastes a lane-type suffix onto X. The suffix is
 * chosen from NPY_BITSOF_LONGLONG; because every inner `#if 1` is true,
 * only the signed variants (_s8/_s16/_s32/_s64) are ever selected and the
 * unsigned `#else` arms are dead. The leading `#if 0` only exists so that
 * each width can be written as an `#elif`. If no width matches, the macro
 * stays undefined and the loops below skip their SIMD paths through their
 * `defined(TO_SIMD_SFX)` guard.
 */
#line 524
#undef TO_SIMD_SFX
#if 0
#line 529
#elif NPY_BITSOF_LONGLONG == 8
    #if 1
        #define TO_SIMD_SFX(X) X##_s8
    #else
        #define TO_SIMD_SFX(X) X##_u8
    #endif

#line 529
#elif NPY_BITSOF_LONGLONG == 16
    #if 1
        #define TO_SIMD_SFX(X) X##_s16
    #else
        #define TO_SIMD_SFX(X) X##_u16
    #endif

#line 529
#elif NPY_BITSOF_LONGLONG == 32
    #if 1
        #define TO_SIMD_SFX(X) X##_s32
    #else
        #define TO_SIMD_SFX(X) X##_u32
    #endif

#line 529
#elif NPY_BITSOF_LONGLONG == 64
    #if 1
        #define TO_SIMD_SFX(X) X##_s64
    #else
        #define TO_SIMD_SFX(X) X##_u64
    #endif

#endif

/*
 * Inner loop computing the C `%` remainder (sign follows the dividend)
 * of args[0] by args[1] for npy_longlong, writing to args[2].
 *
 * A zero divisor raises the divide-by-zero floating-point status flag and
 * stores 0. NPY_MIN_LONGLONG % -1 also stores 0, without raising a flag.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) {
        // dividend and divisor are arrays of the same size
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) &&
            *(npy_longlong *)args[1] != 0) {
        // contiguous dividend, divisor is a single non-zero scalar
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_longlong num = *(npy_longlong *)ip1;
        const npy_longlong den = *(npy_longlong *)ip2;
        // stays 0 for both the divide-by-zero and the overflow case
        npy_longlong res = 0;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_LONGLONG, NPY_TRUE)) {
            FLAG_IF_DIVIDEBYZERO(den);
        }
        else {
            res = num % den;
        }
        *(npy_longlong *)op1 = res;
    }
}

/*
 * Inner loop computing the Python-style remainder of args[0] by args[1]
 * for npy_longlong, writing to args[2].
 *
 * The C `%` result is used as-is unless it is non-zero and the operands
 * differ in sign, in which case the divisor is added to it. A zero divisor
 * raises the divide-by-zero floating-point status flag and stores 0.
 * NPY_MIN_LONGLONG with a divisor of -1 also stores 0, without raising a
 * flag.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) {
        // dividend and divisor are arrays of the same size
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) &&
            *(npy_longlong *)args[1] != 0) {
        // contiguous dividend, divisor is a single non-zero scalar
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const npy_longlong num = *(npy_longlong *)ip1;
        const npy_longlong den = *(npy_longlong *)ip2;
        // stays 0 for both the divide-by-zero and the overflow case
        npy_longlong res = 0;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_LONGLONG, NPY_TRUE)) {
            FLAG_IF_DIVIDEBYZERO(den);
        }
        else {
            res = num % den;
            // mixed-sign operands with a non-zero remainder: match Python
            if (res != 0 && (num > 0) != (den > 0)) {
                res += den;
            }
        }
        *(npy_longlong *)op1 = res;
    }
}

/*
 * Inner loop computing quotient and remainder of args[0] by args[1] for
 * npy_longlong; the quotient goes to args[2] and the remainder to args[3].
 *
 * Special cases:
 *   - divisor == 0: raises the divide-by-zero status flag, stores 0 and 0.
 *   - NPY_MIN_LONGLONG / -1: raises the overflow status flag, stores
 *     NPY_MIN_LONGLONG as quotient and 0 as remainder.
 * Otherwise the C `/` and `%` results are adjusted the way Python does for
 * mixed-sign operands with a non-zero remainder: the quotient is
 * decremented and the divisor is added to the remainder.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) {
        // dividend and divisor are arrays of the same size
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) &&
            *(npy_longlong *)args[1] != 0) {
        // contiguous dividend, divisor is a single non-zero scalar
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP_TWO_OUT {
        const npy_longlong num = *(npy_longlong *)ip1;
        const npy_longlong den = *(npy_longlong *)ip2;
        npy_longlong quo = 0;
        npy_longlong rem = 0;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_LONGLONG, NPY_TRUE)) {
            if (den != 0) {
                // only remaining possibility: NPY_MIN_LONGLONG / -1
                npy_set_floatstatus_overflow();
                quo = NPY_MIN_LONGLONG;
            }
            else {
                npy_set_floatstatus_divbyzero();
            }
        }
        else {
            quo = num / den;
            rem = num % den;
            // mixed-sign operands with a non-zero remainder: match Python
            if (rem != 0 && (num > 0) != (den > 0)) {
                quo -= 1;
                rem += den;
            }
        }
        *(npy_longlong *)op1 = quo;
        *(npy_longlong *)op2 = rem;
    }
}


