ew/ccv_nnc_ew_cpu

Bug Summary

File:	nnc/cmd/ew/ccv_nnc_ew_cpu_ref.c
Warning:	line 1303, column 27 The right operand of '*' is a garbage value
Annotated Source Code

Press '?' to see keyboard shortcuts
Show analyzer invocation
clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_ew_cpu_ref.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd -resource-dir /usr/local/lib/clang/19 -I ../../ -I .. -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/liu/actions-runner/_work/ccv/ccv/_analyze/2025-04-15-094535-608653-1 -x c ew/ccv_nnc_ew_cpu_ref.c
1#include "ccv.h"
2#include "ccv_internal.h"
3#include "nnc/ccv_nnc.h"
4#include "nnc/ccv_nnc_easy.h"
5#include "nnc/ccv_nnc_internal.h"
6#ifdef USE_OPENMP
7#include <omp.h>
8#endif
9#ifdef USE_DISPATCH
10#include <dispatch/dispatch.h>
11#endif

13#include "../_ccv_nnc_cpu_ref.h"

15void _ccv_nnc_ewsum_forw_cpu_ref_f32(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
16{
if (input_size == 1 && output_size == 1)
{
_ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
return;
}
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int cstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int x, z;
int k = 0;
// Bad, I promised this can be inplace operation. Need to first find out if there are share the same pointer first.
for (z = 1; z < input_size; z++)
{
ccv_nnc_tensor_view_t* c = outputs[0];
ccv_nnc_tensor_view_t* a = inputs[z];
if (c->data.f32 == a->data.f32)
{
	k = z;
	break;
}
}
for (z = 0; z < input_size - 1; z++)
{
ccv_nnc_tensor_view_t* c = outputs[0];
ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 45, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 46, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(c->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(c->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 47, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(a, dim);
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 49, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_view_check_dim(c, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(c, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(c, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(c, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 50, __extension__ __PRETTY_FUNCTION__
); }));
if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(c)((*(int*)(c)) & CCV_TENSOR_VIEW))
{
	// Super optimal case, just do one for-loop for sum.
	const int tensor_count = ccv_nnc_tensor_count(a->info);
	for (x = 0; x < tensor_count; x++)
		c->data.f32[x] = a->data.f32[x] + b->data.f32[x];
	continue;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 59, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(a, astride);
ccv_nnc_tensor_view_get_stride(b, bstride);
ccv_nnc_tensor_view_get_stride(c, cstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const ap = a->data.f32;
float* const bp = b->data.f32;
float* const cp = c->data.f32;
const int count = dim[2] * dim[3];
if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && cstride[3] == 1)
{
	// Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* ap0 = ap + i[0] * astride[0];
		float* bp0 = bp + i[0] * bstride[0];
		float* cp0 = cp + i[0] * cstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (x = 0; x < count; x++)
				cp0[x] = ap0[x] + bp0[x];
			ap0 += astride[1];
			bp0 += bstride[1];
			cp0 += cstride[1];
		}
	}
	continue;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* const ap0 = ap + i[0] * astride[0];
	float* const bp0 = bp + i[0] * bstride[0];
	float* const cp0 = cp + i[0] * cstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		float* ap1 = ap0 + i[1] * astride[1];
		float* bp1 = bp0 + i[1] * bstride[1];
		float* cp1 = cp0 + i[1] * cstride[1];
		for (i[2] = 0; i[2] < dim[2]; i[2]++)
		{
			for (x = 0; x < dim[3]; x++)
				cp1[x * cstride[3]] = ap1[x * astride[3]] + bp1[x * bstride[3]];
			ap1 += astride[2];
			bp1 += bstride[2];
			cp1 += cstride[2];
		}
	}
}
}
109}

111void _ccv_nnc_ewsum_forw_cpu_ref_i32(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
112{
if (input_size == 1 && output_size == 1)
{
_ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
return;
}
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int cstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int x, z;
int k = 0;
// Bad, I promised this can be inplace operation. Need to first find out if there are share the same pointer first.
for (z = 1; z < input_size; z++)
{
ccv_nnc_tensor_view_t* c = outputs[0];
ccv_nnc_tensor_view_t* a = inputs[z];
if (c->data.f32 == a->data.f32)
{
	k = z;
	break;
}
}
for (z = 0; z < input_size - 1; z++)
{
ccv_nnc_tensor_view_t* c = outputs[0];
ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 141, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 142, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(c->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(c->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 143, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(a, dim);
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 145, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_view_check_dim(c, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(c, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(c, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(c, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 146, __extension__ __PRETTY_FUNCTION__
); }));
if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(c)((*(int*)(c)) & CCV_TENSOR_VIEW))
{
	// Super optimal case, just do one for-loop for sum.
	const int tensor_count = ccv_nnc_tensor_count(a->info);
	for (x = 0; x < tensor_count; x++)
		c->data.f32[x] = a->data.f32[x] + b->data.f32[x];
	continue;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 155, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(a, astride);
ccv_nnc_tensor_view_get_stride(b, bstride);
ccv_nnc_tensor_view_get_stride(c, cstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
int* const ap = a->data.i32;
int* const bp = b->data.i32;
int* const cp = c->data.i32;
const int count = dim[2] * dim[3];
if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && cstride[3] == 1)
{
	// Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		int* ap0 = ap + i[0] * astride[0];
		int* bp0 = bp + i[0] * bstride[0];
		int* cp0 = cp + i[0] * cstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (x = 0; x < count; x++)
				cp0[x] = ap0[x] + bp0[x];
			ap0 += astride[1];
			bp0 += bstride[1];
			cp0 += cstride[1];
		}
	}
	continue;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	int* const ap0 = ap + i[0] * astride[0];
	int* const bp0 = bp + i[0] * bstride[0];
	int* const cp0 = cp + i[0] * cstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		int* ap1 = ap0 + i[1] * astride[1];
		int* bp1 = bp0 + i[1] * bstride[1];
		int* cp1 = cp0 + i[1] * cstride[1];
		for (i[2] = 0; i[2] < dim[2]; i[2]++)
		{
			for (x = 0; x < dim[3]; x++)
				cp1[x * cstride[3]] = ap1[x * astride[3]] + bp1[x * bstride[3]];
			ap1 += astride[2];
			bp1 += bstride[2];
			cp1 += cstride[2];
		}
	}
}
}
205}

207static int _ccv_nnc_ewsum_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
208{
if (outputs[0]->info.datatype == CCV_32S)
_ccv_nnc_ewsum_forw_cpu_ref_i32((ccv_nnc_tensor_view_t**)inputs, input_size, (ccv_nnc_tensor_view_t**)outputs, output_size);
else
_ccv_nnc_ewsum_forw_cpu_ref_f32((ccv_nnc_tensor_view_t**)inputs, input_size, (ccv_nnc_tensor_view_t**)outputs, output_size);
return CCV_NNC_EXEC_SUCCESS;
214}

216static int _ccv_nnc_ewsum_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
217{
// D[x + y + z, x] = 1
int i;
if (inputs[0] == 0)
{
// Set them to 1.
for (i = 0; i < output_size; i++)
	if (outputs[i])
		_ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[i], 1);
} else {
// Copy over the gradient (If they are not pointing to the same tensor already).
for (i = 0; i < output_size; i++)
	if (outputs[i] && inputs[0]->data.f32 != outputs[i]->data.f32)
		_ccv_nnc_tensor_transfer_cpu_ref_f32((ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)outputs[i]);
}
return CCV_NNC_EXEC_SUCCESS;
233}

235void _ccv_nnc_ewprod_forw_cpu_ref(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
236{
if (input_size == 1 && output_size == 1)
{
_ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
return;
}
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int cstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int x, z;
int k = 0;
// Bad, I promised this can be inplace operation. Need to first find out if there are share the same pointer first.
for (z = 1; z < input_size; z++)
{
ccv_nnc_tensor_view_t* c = outputs[0];
ccv_nnc_tensor_view_t* a = inputs[z];
if (c->data.f32 == a->data.f32)
{
	k = z;
	break;
}
}
for (z = 0; z < input_size - 1; z++)
{
ccv_nnc_tensor_view_t* c = outputs[0];
ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 265, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 266, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(c->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(c->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 267, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(a, dim);
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 269, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_view_check_dim(c, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(c, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(c, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(c, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 270, __extension__ __PRETTY_FUNCTION__
); }));
if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(c)((*(int*)(c)) & CCV_TENSOR_VIEW))
{
	// Super optimal case, just do one for-loop for sum.
	const int tensor_count = ccv_nnc_tensor_count(a->info);
	for (x = 0; x < tensor_count; x++)
		c->data.f32[x] = a->data.f32[x] * b->data.f32[x];
	continue;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 279, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(a, astride);
ccv_nnc_tensor_view_get_stride(b, bstride);
ccv_nnc_tensor_view_get_stride(c, cstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const ap = a->data.f32;
float* const bp = b->data.f32;
float* const cp = c->data.f32;
const int count = dim[2] * dim[3];
if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3])
{
	// Special casing if the ainc[3] is the same as dim[3]
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* ap0 = ap + i[0] * astride[0];
		float* bp0 = bp + i[0] * bstride[0];
		float* cp0 = cp + i[0] * cstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (x = 0; x < count; x++)
				cp0[x] = ap0[x] * bp0[x];
			ap0 += astride[1];
			bp0 += bstride[1];
			cp0 += cstride[1];
		}
	}
	continue;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* const ap0 = ap + i[0] * astride[0];
	float* const bp0 = bp + i[0] * bstride[0];
	float* const cp0 = cp + i[0] * cstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		float* ap1 = ap0 + i[1] * astride[1];
		float* bp1 = bp0 + i[1] * bstride[1];
		float* cp1 = cp0 + i[1] * cstride[1];
		for (i[2] = 0; i[2] < dim[2]; i[2]++)
		{
			for (x = 0; x < dim[3]; x++)
				cp1[x] = ap1[x] * bp1[x];
			ap1 += astride[2];
			bp1 += bstride[2];
			cp1 += cstride[2];
		}
	}
}
}
329}

331static int _ccv_nnc_ewprod_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
332{
_ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t**)inputs, input_size, (ccv_nnc_tensor_view_t**)outputs, output_size);
return CCV_NNC_EXEC_SUCCESS;
335}

337static int _ccv_nnc_ewprod_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
338{
// D[x * y * z, x] = y * z
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int gstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int hstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int x, z;
ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[output_size + 1];
if (g == 0)
{
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 351, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(b, dim);
ccv_nnc_tensor_view_get_stride(b, bstride);
for (z = 0; z < output_size; z++)
{
	ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
	ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 358, __extension__ __PRETTY_FUNCTION__
); }));
	assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(h->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(h->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 359, __extension__ __PRETTY_FUNCTION__
); }));
	assert(ccv_nnc_tensor_view_check_dim(a, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(a, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(a, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(a, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 360, __extension__ __PRETTY_FUNCTION__
); }));
	assert(ccv_nnc_tensor_view_check_dim(h, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(h, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(h, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(h, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 361, __extension__ __PRETTY_FUNCTION__
); }));
	ccv_nnc_tensor_view_get_stride(a, astride);
	ccv_nnc_tensor_view_get_stride(h, hstride);
	if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(h)((*(int*)(h)) & CCV_TENSOR_VIEW))
	{
		// Super optimal case, just do one for-loop for sum.
		const int tensor_count = ccv_nnc_tensor_count(b->info);
		for (x = 0; x < tensor_count; x++)
			h->data.f32[x] = b->data.f32[x] / a->data.f32[x];
		continue;
	}
	assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 372, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	int i[CCV_NNC_MAX_DIM(2) + 2];
	float* const ap = a->data.f32;
	float* const bp = b->data.f32;
	float* const hp = h->data.f32;
	const int count = dim[2] * dim[3];
	if (astride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* ap0 = ap + i[0] * astride[0];
			float* bp0 = bp + i[0] * bstride[0];
			float* hp0 = hp + i[0] * hstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					hp0[x] = bp0[x] / ap0[x];
				ap0 += astride[1];
				bp0 += bstride[1];
				hp0 += hstride[1];
			}
		}
		continue;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const ap0 = ap + i[0] * astride[0];
		float* const bp0 = bp + i[0] * bstride[0];
		float* const hp0 = hp + i[0] * hstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* ap1 = ap0 + i[1] * astride[1];
			float* bp1 = bp0 + i[1] * bstride[1];
			float* hp1 = hp0 + i[1] * hstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					hp1[x] = bp1[x] / ap1[x];
				ap1 += astride[2];
				bp1 += bstride[2];
				hp1 += hstride[2];
			}
		}
	}
}
} else {
assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(g->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(g->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 420, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 421, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(b, dim);
assert(ccv_nnc_tensor_view_check_dim(g, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(g, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(g, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(g, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 423, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_stride(b, bstride);
ccv_nnc_tensor_view_get_stride(g, gstride);
for (z = 0; z < output_size; z++)
{
	ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
	ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 430, __extension__ __PRETTY_FUNCTION__
); }));
	assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(h->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(h->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 431, __extension__ __PRETTY_FUNCTION__
); }));
	assert(ccv_nnc_tensor_view_check_dim(a, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(a, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(a, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(a, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 432, __extension__ __PRETTY_FUNCTION__
); }));
	assert(ccv_nnc_tensor_view_check_dim(h, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(h, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(h, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(h, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 433, __extension__ __PRETTY_FUNCTION__
); }));
	ccv_nnc_tensor_view_get_stride(a, astride);
	ccv_nnc_tensor_view_get_stride(h, hstride);
	if (!CCV_IS_TENSOR_VIEW(g)((*(int*)(g)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(h)((*(int*)(h)) & CCV_TENSOR_VIEW))
	{
		// Super optimal case, just do one for-loop for sum.
		const int tensor_count = ccv_nnc_tensor_count(g->info);
		for (x = 0; x < tensor_count; x++)
			h->data.f32[x] = g->data.f32[x] * b->data.f32[x] / a->data.f32[x];
		continue;
	}
	assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 444, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	int i[CCV_NNC_MAX_DIM(2) + 2];
	float* const gp = g->data.f32;
	float* const ap = a->data.f32;
	float* const bp = b->data.f32;
	float* const hp = h->data.f32;
	const int count = dim[2] * dim[3];
	if (gstride[2] == dim[3] && astride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* gp0 = gp + i[0] * gstride[0];
			float* ap0 = ap + i[0] * astride[0];
			float* bp0 = bp + i[0] * bstride[0];
			float* hp0 = hp + i[0] * hstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					hp0[x] = gp0[x] * bp0[x] / ap0[x];
				gp0 += gstride[1];
				ap0 += astride[1];
				bp0 += bstride[1];
				hp0 += hstride[1];
			}
		}
		continue;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const gp0 = gp + i[0] * gstride[0];
		float* const ap0 = ap + i[0] * astride[0];
		float* const bp0 = bp + i[0] * bstride[0];
		float* const hp0 = hp + i[0] * hstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* gp1 = gp0 + i[1] * gstride[1];
			float* ap1 = ap0 + i[1] * astride[1];
			float* bp1 = bp0 + i[1] * bstride[1];
			float* hp1 = hp0 + i[1] * hstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					hp1[x] = gp1[x] * bp1[x] / ap1[x];
				gp1 += gstride[2];
				ap1 += astride[2];
				bp1 += bstride[2];
				hp1 += hstride[2];
			}
		}
	}
}
}
return CCV_NNC_EXEC_SUCCESS;
499}

501static void _ccv_nnc_ewdiv_forw_cpu_ref(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
502{
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int cstride[CCV_NNC_MAX_DIM_ALLOC(12)];
if (a == 0) // Take 0 as all ones tensor.
{
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 510, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(c->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(c->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 511, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(b, dim);
assert(ccv_nnc_tensor_view_check_dim(c, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(c, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(c, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(c, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 513, __extension__ __PRETTY_FUNCTION__
); }));
int x;
if (!CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(c)((*(int*)(c)) & CCV_TENSOR_VIEW))
{
	// Super optimal case, just do one for-loop for sum.
	const int tensor_count = ccv_nnc_tensor_count(b->info);
	for (x = 0; x < tensor_count; x++)
		c->data.f32[x] = p / b->data.f32[x];
	return;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 523, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(b, bstride);
ccv_nnc_tensor_view_get_stride(c, cstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const bp = b->data.f32;
float* const cp = c->data.f32;
const int count = dim[2] * dim[3];
if (bstride[2] == dim[3] && cstride[2] == dim[3])
{
	// Special casing if the ainc[3] is the same as dim[3]
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* bp0 = bp + i[0] * bstride[0];
		float* cp0 = cp + i[0] * cstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (x = 0; x < count; x++)
				cp0[x] = p / bp0[x];
			bp0 += bstride[1];
			cp0 += cstride[1];
		}
	}
	return;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* const bp0 = bp + i[0] * bstride[0];
	float* const cp0 = cp + i[0] * cstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		float* bp1 = bp0 + i[1] * bstride[1];
		float* cp1 = cp0 + i[1] * cstride[1];
		for (i[2] = 0; i[2] < dim[2]; i[2]++)
		{
			for (x = 0; x < dim[3]; x++)
				cp1[x] = p / bp1[x];
			bp1 += bstride[2];
			cp1 += cstride[2];
		}
	}
}
} else {
assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 566, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 567, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(c->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(c->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 568, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(a, dim);
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 570, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_view_check_dim(c, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(c, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(c, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(c, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 571, __extension__ __PRETTY_FUNCTION__
); }));
int x;
if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(c)((*(int*)(c)) & CCV_TENSOR_VIEW))
{
	// Super optimal case, just do one for-loop for sum.
	const int tensor_count = ccv_nnc_tensor_count(a->info);
	for (x = 0; x < tensor_count; x++)
		c->data.f32[x] = p * a->data.f32[x] / b->data.f32[x];
	return;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 581, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(a, astride);
ccv_nnc_tensor_view_get_stride(b, bstride);
ccv_nnc_tensor_view_get_stride(c, cstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const ap = a->data.f32;
float* const bp = b->data.f32;
float* const cp = c->data.f32;
const int count = dim[2] * dim[3];
if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3])
{
	// Special casing if the ainc[3] is the same as dim[3]
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* ap0 = ap + i[0] * astride[0];
		float* bp0 = bp + i[0] * bstride[0];
		float* cp0 = cp + i[0] * cstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (x = 0; x < count; x++)
				cp0[x] = p * ap0[x] / bp0[x];
			ap0 += astride[1];
			bp0 += bstride[1];
			cp0 += cstride[1];
		}
	}
	return;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* const ap0 = ap + i[0] * astride[0];
	float* const bp0 = bp + i[0] * bstride[0];
	float* const cp0 = cp + i[0] * cstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		float* ap1 = ap0 + i[1] * astride[1];
		float* bp1 = bp0 + i[1] * bstride[1];
		float* cp1 = cp0 + i[1] * cstride[1];
		for (i[2] = 0; i[2] < dim[2]; i[2]++)
		{
			for (x = 0; x < dim[3]; x++)
				cp1[x] = p * ap1[x] / bp1[x];
			ap1 += astride[2];
			bp1 += bstride[2];
			cp1 += cstride[2];
		}
	}
}
}
631}

633static int _ccv_nnc_ewdiv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
634{
_ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
return CCV_NNC_EXEC_SUCCESS;
637}

639static int _ccv_nnc_ewdiv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
640{
// D[x / y, x] = 1 / y, D[x / y, y] = -x / y^2
if (output_size == 1 || outputs[1] == 0)
{
// When we only need D[x / y, x]
_ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
return CCV_NNC_EXEC_SUCCESS;
}
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int gstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int cstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int hastride[CCV_NNC_MAX_DIM_ALLOC(12)];
int hbstride[CCV_NNC_MAX_DIM_ALLOC(12)];
ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)inputs[3];
ccv_nnc_tensor_view_t* ha = (ccv_nnc_tensor_view_t*)outputs[0];
ccv_nnc_tensor_view_t* hb = (ccv_nnc_tensor_view_t*)outputs[1];
if (g == 0)
{
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 661, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(c->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(c->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 662, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(hb->info.dim) <= (2)
 + 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(hb->
info.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 663, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(b, dim);
assert(ccv_nnc_tensor_view_check_dim(c, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(c, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(c, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(c, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 665, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_view_check_dim(hb, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(hb, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(hb, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(hb, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 666, __extension__ __PRETTY_FUNCTION__
); }));
if (ha)
{
	assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(ha->info.dim) <= (2)
 + 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(ha->
info.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 669, __extension__ __PRETTY_FUNCTION__
); }));
	assert(ccv_nnc_tensor_view_check_dim(ha, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(ha, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(ha, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(ha, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 670, __extension__ __PRETTY_FUNCTION__
); }));
}
int x;
if (!CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(c)((*(int*)(c)) & CCV_TENSOR_VIEW) && (ha == 0 || !CCV_IS_TENSOR_VIEW(ha)((*(int*)(ha)) & CCV_TENSOR_VIEW)) && !CCV_IS_TENSOR_VIEW(hb)((*(int*)(hb)) & CCV_TENSOR_VIEW))
{
	// Super optimal case, just do one for-loop for sum.
	const int tensor_count = ccv_nnc_tensor_count(b->info);
	if (ha == 0)
	{
		for (x = 0; x < tensor_count; x++)
		{
			const float v = 1 / b->data.f32[x];
			hb->data.f32[x] = -c->data.f32[x] * v;
		}
	} else {
		for (x = 0; x < tensor_count; x++)
		{
			const float v = 1 / b->data.f32[x];
			ha->data.f32[x] = v;
			hb->data.f32[x] = -c->data.f32[x] * v;
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 694, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(b, bstride);
ccv_nnc_tensor_view_get_stride(c, cstride);
ccv_nnc_tensor_view_get_stride(hb, hbstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const bp = b->data.f32;
float* const cp = c->data.f32;
float* const hbp = hb->data.f32;
const int count = dim[2] * dim[3];
if (ha == 0)
{
	if (bstride[2] == dim[3] && cstride[2] == dim[3] && hbstride[2] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* bp0 = bp + i[0] * bstride[0];
			float* cp0 = cp + i[0] * cstride[0];
			float* hbp0 = hbp + i[0] * hbstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
				{
					const float v = 1 / bp0[x];
					hbp0[x] = -cp0[x] * v;
				}
				bp0 += bstride[1];
				cp0 += cstride[1];
				hbp0 += hbstride[1];
			}
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const bp0 = bp + i[0] * bstride[0];
		float* const cp0 = cp + i[0] * cstride[0];
		float* const hbp0 = hbp + i[0] * hbstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* bp1 = bp0 + i[1] * bstride[1];
			float* cp1 = cp0 + i[1] * cstride[1];
			float* hbp1 = hbp0 + i[1] * hbstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
				{
					const float v = 1 / bp1[x];
					hbp1[x] = -cp1[x] * v;
				}
				bp1 += bstride[2];
				cp1 += cstride[2];
				hbp1 += hbstride[2];
			}
		}
	}
} else {
	float* const hap = ha->data.f32;
	ccv_nnc_tensor_view_get_stride(ha, hastride);
	if (bstride[2] == dim[3] && cstride[2] == dim[3] && hastride[2] == dim[3] && hbstride[2] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* bp0 = bp + i[0] * bstride[0];
			float* cp0 = cp + i[0] * cstride[0];
			float* hap0 = hap + i[0] * hastride[0];
			float* hbp0 = hbp + i[0] * hbstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
				{
					const float v = 1 / bp0[x];
					hap0[x] = v;
					hbp0[x] = -cp0[x] * v;
				}
				bp0 += bstride[1];
				cp0 += cstride[1];
				hap0 += hastride[1];
				hbp0 += hbstride[1];
			}
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const bp0 = bp + i[0] * bstride[0];
		float* const cp0 = cp + i[0] * cstride[0];
		float* const hap0 = hap + i[0] * hastride[0];
		float* const hbp0 = hbp + i[0] * hbstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* bp1 = bp0 + i[1] * bstride[1];
			float* cp1 = cp0 + i[1] * cstride[1];
			float* hap1 = hap0 + i[1] * hastride[1];
			float* hbp1 = hbp0 + i[1] * hbstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
				{
					const float v = 1 / bp1[x];
					hap1[x] = v;
					hbp1[x] = -cp1[x] * v;
				}
				bp1 += bstride[2];
				cp1 += cstride[2];
				hap1 += hastride[2];
				hbp1 += hbstride[2];
			}
		}
	}
}
} else {
assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(g->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(g->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 809, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 810, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(c->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(c->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 811, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(hb->info.dim) <= (2)
 + 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(hb->
info.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 812, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(b, dim);
assert(ccv_nnc_tensor_view_check_dim(g, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(g, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(g, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(g, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 814, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_view_check_dim(c, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(c, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(c, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(c, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 815, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_view_check_dim(hb, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(hb, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(hb, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(hb, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 816, __extension__ __PRETTY_FUNCTION__
); }));
if (ha)
{
	assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(ha->info.dim) <= (2)
 + 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(ha->
info.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 819, __extension__ __PRETTY_FUNCTION__
); }));
	assert(ccv_nnc_tensor_view_check_dim(ha, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(ha, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(ha, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(ha, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 820, __extension__ __PRETTY_FUNCTION__
); }));
}
int x;
if (!CCV_IS_TENSOR_VIEW(g)((*(int*)(g)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(c)((*(int*)(c)) & CCV_TENSOR_VIEW) && (ha == 0 || !CCV_IS_TENSOR_VIEW(ha)((*(int*)(ha)) & CCV_TENSOR_VIEW)) && !CCV_IS_TENSOR_VIEW(hb)((*(int*)(hb)) & CCV_TENSOR_VIEW))
{
	// Super optimal case, just do one for-loop for sum.
	const int tensor_count = ccv_nnc_tensor_count(g->info);
	if (ha == 0)
	{
		for (x = 0; x < tensor_count; x++)
		{
			const float v = g->data.f32[x] / b->data.f32[x];
			hb->data.f32[x] = -c->data.f32[x] * v;
		}
	} else {
		for (x = 0; x < tensor_count; x++)
		{
			const float v = g->data.f32[x] / b->data.f32[x];
			ha->data.f32[x] = v;
			hb->data.f32[x] = -c->data.f32[x] * v;
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 844, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(g, gstride);
ccv_nnc_tensor_view_get_stride(b, bstride);
ccv_nnc_tensor_view_get_stride(c, cstride);
ccv_nnc_tensor_view_get_stride(hb, hbstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const gp = g->data.f32;
float* const bp = b->data.f32;
float* const cp = c->data.f32;
float* const hbp = hb->data.f32;
const int count = dim[2] * dim[3];
if (ha == 0)
{
	if (gstride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && hbstride[2] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* gp0 = gp + i[0] * gstride[0];
			float* bp0 = bp + i[0] * bstride[0];
			float* cp0 = cp + i[0] * cstride[0];
			float* hbp0 = hbp + i[0] * hbstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
				{
					const float v = gp0[x] / bp0[x];
					hbp0[x] = -cp0[x] * v;
				}
				gp0 += gstride[1];
				bp0 += bstride[1];
				cp0 += cstride[1];
				hbp0 += hbstride[1];
			}
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const gp0 = gp + i[0] * gstride[0];
		float* const bp0 = bp + i[0] * bstride[0];
		float* const cp0 = cp + i[0] * cstride[0];
		float* const hbp0 = hbp + i[0] * hbstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* gp1 = gp0 + i[1] * gstride[1];
			float* bp1 = bp0 + i[1] * bstride[1];
			float* cp1 = cp0 + i[1] * cstride[1];
			float* hbp1 = hbp0 + i[1] * hbstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
				{
					const float v = gp1[x] / bp1[x];
					hbp1[x] = -cp1[x] * v;
				}
				gp1 += gstride[2];
				bp1 += bstride[2];
				cp1 += cstride[2];
				hbp1 += hbstride[2];
			}
		}
	}
} else {
	ccv_nnc_tensor_view_get_stride(ha, hastride);
	float* const hap = ha->data.f32;
	if (gstride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && hastride[2] == dim[3] && hbstride[2] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* gp0 = gp + i[0] * gstride[0];
			float* bp0 = bp + i[0] * bstride[0];
			float* cp0 = cp + i[0] * cstride[0];
			float* hap0 = hap + i[0] * hastride[0];
			float* hbp0 = hbp + i[0] * hbstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
				{
					const float v = gp0[x] / bp0[x];
					hap0[x] = v;
					hbp0[x] = -cp0[x] * v;
				}
				gp0 += gstride[1];
				bp0 += bstride[1];
				cp0 += cstride[1];
				hap0 += hastride[1];
				hbp0 += hbstride[1];
			}
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const gp0 = gp + i[0] * gstride[0];
		float* const bp0 = bp + i[0] * bstride[0];
		float* const cp0 = cp + i[0] * cstride[0];
		float* const hap0 = hap + i[0] * hastride[0];
		float* const hbp0 = hbp + i[0] * hbstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* gp1 = gp0 + i[1] * gstride[1];
			float* bp1 = bp0 + i[1] * bstride[1];
			float* cp1 = cp0 + i[1] * cstride[1];
			float* hap1 = hap0 + i[1] * hastride[1];
			float* hbp1 = hbp0 + i[1] * hbstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
				{
					const float v = gp1[x] / bp1[x];
					hap1[x] = v;
					hbp1[x] = -cp1[x] * v;
				}
				gp1 += gstride[2];
				bp1 += bstride[2];
				cp1 += cstride[2];
				hap1 += hastride[2];
				hbp1 += hbstride[2];
			}
		}
	}
}
}
return CCV_NNC_EXEC_SUCCESS;
972}

974static int _ccv_nnc_ewexp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
975{
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 982, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 983, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(a, dim);
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 985, __extension__ __PRETTY_FUNCTION__
); }));
int x;
if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW))
{
// Super optimal case, just do one for-loop for sum.
const int tensor_count = ccv_nnc_tensor_count(a->info);
for (x = 0; x < tensor_count; x++)
	b->data.f32[x] = exp(a->data.f32[x]);
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 995, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(a, astride);
ccv_nnc_tensor_view_get_stride(b, bstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const ap = a->data.f32;
float* const bp = b->data.f32;
const int count = dim[2] * dim[3];
if (astride[2] == dim[3] && bstride[2] == dim[3])
{
// Special casing if the ainc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* ap0 = ap + i[0] * astride[0];
	float* bp0 = bp + i[0] * bstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		for (x = 0; x < count; x++)
			bp0[x] = exp(ap0[x]);
		ap0 += astride[1];
		bp0 += bstride[1];
	}
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
float* const ap0 = ap + i[0] * astride[0];
float* const bp0 = bp + i[0] * bstride[0];
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
	float* ap1 = ap0 + i[1] * astride[1];
	float* bp1 = bp0 + i[1] * bstride[1];
	for (i[2] = 0; i[2] < dim[2]; i[2]++)
	{
		for (x = 0; x < dim[3]; x++)
			bp1[x] = exp(ap1[x]);
		ap1 += astride[2];
		bp1 += bstride[2];
	}
}
}
return CCV_NNC_EXEC_SUCCESS;
1038}

1040static int _ccv_nnc_ewexp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1041{
// D[Exp[x], x] = Exp[x]
if (inputs[0] == 0)
_ccv_nnc_tensor_transfer_cpu_ref_f32((ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
else
_ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t*[]){
	(ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2]
}, 2, (ccv_nnc_tensor_view_t**)outputs, output_size);
return CCV_NNC_EXEC_SUCCESS;
1050}

1052static int _ccv_nnc_ewlog_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1053{
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1060, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1061, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(a, dim);
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1063, __extension__ __PRETTY_FUNCTION__
); }));
int x;
if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW))
{
// Super optimal case, just do one for-loop for sum.
const int tensor_count = ccv_nnc_tensor_count(a->info);
for (x = 0; x < tensor_count; x++)
	b->data.f32[x] = log(a->data.f32[x]);
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 1073, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(a, astride);
ccv_nnc_tensor_view_get_stride(b, bstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const ap = a->data.f32;
float* const bp = b->data.f32;
const int count = dim[2] * dim[3];
if (astride[2] == dim[3] && bstride[2] == dim[3])
{
// Special casing if the ainc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* ap0 = ap + i[0] * astride[0];
	float* bp0 = bp + i[0] * bstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		for (x = 0; x < count; x++)
			bp0[x] = log(ap0[x]);
		ap0 += astride[1];
		bp0 += bstride[1];
	}
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
float* const ap0 = ap + i[0] * astride[0];
float* const bp0 = bp + i[0] * bstride[0];
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
	float* ap1 = ap0 + i[1] * astride[1];
	float* bp1 = bp0 + i[1] * bstride[1];
	for (i[2] = 0; i[2] < dim[2]; i[2]++)
	{
		for (x = 0; x < dim[3]; x++)
			bp1[x] = log(ap1[x]);
		ap1 += astride[2];
		bp1 += bstride[2];
	}
}
}
return CCV_NNC_EXEC_SUCCESS;
1116}

1118static int _ccv_nnc_ewlog_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1119{
// D[Log[x], x] = 1 / x
_ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
return CCV_NNC_EXEC_SUCCESS;
1123}

1125static int _ccv_nnc_ewsqrt_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1126{
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1133, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1134, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(a, dim);
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1136, __extension__ __PRETTY_FUNCTION__
); }));
int x;
if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW))
{
// Super optimal case, just do one for-loop for sum.
const int tensor_count = ccv_nnc_tensor_count(a->info);
for (x = 0; x < tensor_count; x++)
	b->data.f32[x] = sqrt(a->data.f32[x]);
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 1146, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(a, astride);
ccv_nnc_tensor_view_get_stride(b, bstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const ap = a->data.f32;
float* const bp = b->data.f32;
const int count = dim[2] * dim[3];
if (astride[2] == dim[3] && bstride[2] == dim[3])
{
// Special casing if the ainc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* ap0 = ap + i[0] * astride[0];
	float* bp0 = bp + i[0] * bstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		for (x = 0; x < count; x++)
			bp0[x] = sqrt(ap0[x]);
		ap0 += astride[1];
		bp0 += bstride[1];
	}
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
float* const ap0 = ap + i[0] * astride[0];
float* const bp0 = bp + i[0] * bstride[0];
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
	float* ap1 = ap0 + i[1] * astride[1];
	float* bp1 = bp0 + i[1] * bstride[1];
	for (i[2] = 0; i[2] < dim[2]; i[2]++)
	{
		for (x = 0; x < dim[3]; x++)
			bp1[x] = sqrt(ap1[x]);
		ap1 += astride[2];
		bp1 += bstride[2];
	}
}
}
return CCV_NNC_EXEC_SUCCESS;
1189}

1191static int _ccv_nnc_ewsqrt_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1192{
// D[Sqrt[x], x] = 0.5 / Sqrt[x]
_ccv_nnc_ewdiv_forw_cpu_ref(0.5, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
return CCV_NNC_EXEC_SUCCESS;
1196}

1198static int _ccv_nnc_ewabs_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1199{
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1206, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1207, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(a, dim);
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1209, __extension__ __PRETTY_FUNCTION__
); }));
int x;
if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW))
{
// Super optimal case, just do one for-loop for sum.
const int tensor_count = ccv_nnc_tensor_count(a->info);
for (x = 0; x < tensor_count; x++)
	b->data.f32[x] = fabs(a->data.f32[x]);
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 1219, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(a, astride);
ccv_nnc_tensor_view_get_stride(b, bstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const ap = a->data.f32;
float* const bp = b->data.f32;
const int count = dim[2] * dim[3];
if (astride[2] == dim[3] && bstride[2] == dim[3])
{
// Special casing if the ainc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* ap0 = ap + i[0] * astride[0];
	float* bp0 = bp + i[0] * bstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		for (x = 0; x < count; x++)
			bp0[x] = fabs(ap0[x]);
		ap0 += astride[1];
		bp0 += bstride[1];
	}
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
float* const ap0 = ap + i[0] * astride[0];
float* const bp0 = bp + i[0] * bstride[0];
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
	float* ap1 = ap0 + i[1] * astride[1];
	float* bp1 = bp0 + i[1] * bstride[1];
	for (i[2] = 0; i[2] < dim[2]; i[2]++)
	{
		for (x = 0; x < dim[3]; x++)
			bp1[x] = fabs(ap1[x]);
		ap1 += astride[2];
		bp1 += bstride[2];
	}
}
}
return CCV_NNC_EXEC_SUCCESS;
1262}

1264static int _ccv_nnc_ewabs_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1265{
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int gstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[1];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(g->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(g->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1274, __extension__ __PRETTY_FUNCTION__
); }));
1
Assuming the condition is true→
2
←
Taking true branch→
assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1275, __extension__ __PRETTY_FUNCTION__
); }));
3
←
Assuming the condition is true→
4
←
Taking true branch→
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1276, __extension__ __PRETTY_FUNCTION__
); }));
5
←
Assuming the condition is true→
6
←
Taking true branch→
ccv_nnc_tensor_view_get_dim(a, dim);
assert(ccv_nnc_tensor_view_check_dim(g, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(g, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(g, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(g, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1278, __extension__ __PRETTY_FUNCTION__
); }));
7
←
Assuming the condition is true→
8
←
Taking true branch→
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1279, __extension__ __PRETTY_FUNCTION__
); }));
9
←
Assuming the condition is true→
10
←
Taking true branch→
int x;
if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(g)((*(int*)(g)) & CCV_TENSOR_VIEW))
11
←
Assuming the condition is false→
{
// Super optimal case, just do one for-loop for sum.
const int tensor_count = ccv_nnc_tensor_count(a->info);
for (x = 0; x < tensor_count; x++)
	b->data.f32[x] = a->data.f32[x] >= 0 ? g->data.f32[x] : -g->data.f32[x];
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 1289, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
12
←
Taking true branch→
ccv_nnc_tensor_view_get_stride(g, astride);
ccv_nnc_tensor_view_get_stride(a, astride);
ccv_nnc_tensor_view_get_stride(b, bstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const gp = g->data.f32;
float* const ap = a->data.f32;
float* const bp = b->data.f32;
const int count = dim[2] * dim[3];
if (astride[2] == dim[3] && bstride[2] == dim[3])
13
←
Assuming the condition is true→
14
←
Assuming the condition is true→
15
←
Taking true branch→
{
// Special casing if the ainc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
16
←
Assuming the condition is true→
17
←
Loop condition is true.  Entering loop body→
{
	float* gp0 = gp + i[0] * gstride[0];
18
←
The right operand of '*' is a garbage value
	float* ap0 = ap + i[0] * astride[0];
	float* bp0 = bp + i[0] * bstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		for (x = 0; x < count; x++)
			bp0[x] = ap0[x] >= 0 ? gp0[x] : -gp0[x];
		gp0 += gstride[1];
		ap0 += astride[1];
		bp0 += bstride[1];
	}
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
float* const gp0 = gp + i[0] * gstride[0];
float* const ap0 = ap + i[0] * astride[0];
float* const bp0 = bp + i[0] * bstride[0];
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
	float* gp1 = gp0 + i[1] * gstride[1];
	float* ap1 = ap0 + i[1] * astride[1];
	float* bp1 = bp0 + i[1] * bstride[1];
	for (i[2] = 0; i[2] < dim[2]; i[2]++)
	{
		for (x = 0; x < dim[3]; x++)
			bp1[x] = ap1[x] >= 0 ? gp1[x] : -gp1[x];
		gp1 += gstride[2];
		ap1 += astride[2];
		bp1 += bstride[2];
	}
}
}
return CCV_NNC_EXEC_SUCCESS;
1339}

1341static int _ccv_nnc_clamp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1342{
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int astride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(a->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(a->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1349, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1350, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(a, dim);
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1352, __extension__ __PRETTY_FUNCTION__
); }));
int x;
const float min = cmd.info.clamp.min;
const float max = cmd.info.clamp.max;
assert(!isnan(min) || !isnan(max))((void) sizeof ((!__builtin_isnan (min) || !__builtin_isnan (
max)) ? 1 : 0), __extension__ ({ if (!__builtin_isnan (min) ||
 !__builtin_isnan (max)) ; else __assert_fail ("!isnan(min) || !isnan(max)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1356, __extension__ __PRETTY_FUNCTION__
); }));
if (!CCV_IS_TENSOR_VIEW(a)((*(int*)(a)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW))
{
// Super optimal case, just do one for-loop for sum.
const int tensor_count = ccv_nnc_tensor_count(a->info);
if (isnan(min)__builtin_isnan (min))
{
	for (x = 0; x < tensor_count; x++)
		b->data.f32[x] = ccv_min(a->data.f32[x], max)({ typeof (a->data.f32[x]) _a = (a->data.f32[x]); typeof
 (max) _b = (max); (_a < _b) ? _a : _b; });
} else if (isnan(max)__builtin_isnan (max)) {
	for (x = 0; x < tensor_count; x++)
		b->data.f32[x] = ccv_max(a->data.f32[x], min)({ typeof (a->data.f32[x]) _a = (a->data.f32[x]); typeof
 (min) _b = (min); (_a > _b) ? _a : _b; });
} else {
	for (x = 0; x < tensor_count; x++)
		b->data.f32[x] = ccv_clamp(a->data.f32[x], min, max)({ typeof (min) _a = (min); typeof (max) _b = (max); typeof (
a->data.f32[x]) _x = (a->data.f32[x]); (_x < _a) ? _a
 : ((_x > _b) ? _b : _x); });
}
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 1374, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(a, astride);
ccv_nnc_tensor_view_get_stride(b, bstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const ap = a->data.f32;
float* const bp = b->data.f32;
const int count = dim[2] * dim[3];
if (isnan(min)__builtin_isnan (min))
{
if (astride[2] == dim[3] && bstride[2] == dim[3])
{
	// Special casing if the ainc[3] is the same as dim[3]
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* ap0 = ap + i[0] * astride[0];
		float* bp0 = bp + i[0] * bstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (x = 0; x < count; x++)
				bp0[x] = ccv_min(ap0[x], max)({ typeof (ap0[x]) _a = (ap0[x]); typeof (max) _b = (max); (_a
 < _b) ? _a : _b; });
			ap0 += astride[1];
			bp0 += bstride[1];
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* const ap0 = ap + i[0] * astride[0];
	float* const bp0 = bp + i[0] * bstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		float* ap1 = ap0 + i[1] * astride[1];
		float* bp1 = bp0 + i[1] * bstride[1];
		for (i[2] = 0; i[2] < dim[2]; i[2]++)
		{
			for (x = 0; x < dim[3]; x++)
				bp1[x] = ccv_min(ap1[x], max)({ typeof (ap1[x]) _a = (ap1[x]); typeof (max) _b = (max); (_a
 < _b) ? _a : _b; });
			ap1 += astride[2];
			bp1 += bstride[2];
		}
	}
}
} else if (isnan(max)__builtin_isnan (max)) {
if (astride[2] == dim[3] && bstride[2] == dim[3])
{
	// Special casing if the ainc[3] is the same as dim[3]
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* ap0 = ap + i[0] * astride[0];
		float* bp0 = bp + i[0] * bstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (x = 0; x < count; x++)
				bp0[x] = ccv_max(ap0[x], min)({ typeof (ap0[x]) _a = (ap0[x]); typeof (min) _b = (min); (_a
 > _b) ? _a : _b; });
			ap0 += astride[1];
			bp0 += bstride[1];
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* const ap0 = ap + i[0] * astride[0];
	float* const bp0 = bp + i[0] * bstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		float* ap1 = ap0 + i[1] * astride[1];
		float* bp1 = bp0 + i[1] * bstride[1];
		for (i[2] = 0; i[2] < dim[2]; i[2]++)
		{
			for (x = 0; x < dim[3]; x++)
				bp1[x] = ccv_max(ap1[x], min)({ typeof (ap1[x]) _a = (ap1[x]); typeof (min) _b = (min); (_a
 > _b) ? _a : _b; });
			ap1 += astride[2];
			bp1 += bstride[2];
		}
	}
}
} else {
if (astride[2] == dim[3] && bstride[2] == dim[3])
{
	// Special casing if the ainc[3] is the same as dim[3]
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* ap0 = ap + i[0] * astride[0];
		float* bp0 = bp + i[0] * bstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (x = 0; x < count; x++)
				bp0[x] = ccv_clamp(ap0[x], min, max)({ typeof (min) _a = (min); typeof (max) _b = (max); typeof (
ap0[x]) _x = (ap0[x]); (_x < _a) ? _a : ((_x > _b) ? _b
 : _x); });
			ap0 += astride[1];
			bp0 += bstride[1];
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
	float* const ap0 = ap + i[0] * astride[0];
	float* const bp0 = bp + i[0] * bstride[0];
	for (i[1] = 0; i[1] < dim[1]; i[1]++)
	{
		float* ap1 = ap0 + i[1] * astride[1];
		float* bp1 = bp0 + i[1] * bstride[1];
		for (i[2] = 0; i[2] < dim[2]; i[2]++)
		{
			for (x = 0; x < dim[3]; x++)
				bp1[x] = ccv_clamp(ap1[x], min, max)({ typeof (min) _a = (min); typeof (max) _b = (max); typeof (
ap1[x]) _x = (ap1[x]); (_x < _a) ? _a : ((_x > _b) ? _b
 : _x); });
			ap1 += astride[2];
			bp1 += bstride[2];
		}
	}
}
}
return CCV_NNC_EXEC_SUCCESS;
1492}

1494static int _ccv_nnc_clamp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1495{
assert(input_size == 3)((void) sizeof ((input_size == 3) ? 1 : 0), __extension__ ({ if
 (input_size == 3) ; else __assert_fail ("input_size == 3", "ew/ccv_nnc_ew_cpu_ref.c"
, 1496, __extension__ __PRETTY_FUNCTION__); }));
const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradient
const ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
assert(output_size == 1)((void) sizeof ((output_size == 1) ? 1 : 0), __extension__ ({
 if (output_size == 1) ; else __assert_fail ("output_size == 1"
, "ew/ccv_nnc_ew_cpu_ref.c", 1499, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0];
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM_ALLOC(12)];
int hstride[CCV_NNC_MAX_DIM_ALLOC(12)];
int bstride[CCV_NNC_MAX_DIM_ALLOC(12)];
assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(h->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(h->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1505, __extension__ __PRETTY_FUNCTION__
); }));
assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(b->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(b->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1506, __extension__ __PRETTY_FUNCTION__
); }));
ccv_nnc_tensor_view_get_dim(g, dim);
ccv_nnc_tensor_view_get_dim(h, dim);
assert(ccv_nnc_tensor_view_check_dim(b, dim))((void) sizeof ((ccv_nnc_tensor_view_check_dim(b, dim)) ? 1 :
 0), __extension__ ({ if (ccv_nnc_tensor_view_check_dim(b, dim
)) ; else __assert_fail ("ccv_nnc_tensor_view_check_dim(b, dim)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1509, __extension__ __PRETTY_FUNCTION__
); }));
int x;
const float min = cmd.info.clamp.min;
const float max = cmd.info.clamp.max;
assert(!isnan(min) || !isnan(max))((void) sizeof ((!__builtin_isnan (min) || !__builtin_isnan (
max)) ? 1 : 0), __extension__ ({ if (!__builtin_isnan (min) ||
 !__builtin_isnan (max)) ; else __assert_fail ("!isnan(min) || !isnan(max)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1513, __extension__ __PRETTY_FUNCTION__
); }));
if (g)
{
if (!CCV_IS_TENSOR_VIEW(g)((*(int*)(g)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(h)((*(int*)(h)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW))
{
	// Super optimal case, just do one for-loop for sum.
	const int tensor_count = ccv_nnc_tensor_count(g->info);
	if (isnan(min)__builtin_isnan (min))
	{
		for (x = 0; x < tensor_count; x++)
			h->data.f32[x] = b->data.f32[x] >= max ? 0 : g->data.f32[x];
	} else if (isnan(max)__builtin_isnan (max)) {
		for (x = 0; x < tensor_count; x++)
			h->data.f32[x] = b->data.f32[x] <= min ? 0 : g->data.f32[x];
	} else {
		for (x = 0; x < tensor_count; x++)
			h->data.f32[x] = (b->data.f32[x] >= max || b->data.f32[x] <= min) ? 0 : g->data.f32[x];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
int gstride[CCV_NNC_MAX_DIM_ALLOC(12)];
assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2)((void) sizeof ((ccv_nnc_tensor_nd(g->info.dim) <= (2) +
 2) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_nd(g->info
.dim) <= (2) + 2) ; else __assert_fail ("ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2"
, "ew/ccv_nnc_ew_cpu_ref.c", 1534, __extension__ __PRETTY_FUNCTION__
); }));
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 1535, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(g, gstride);
ccv_nnc_tensor_view_get_stride(b, bstride);
ccv_nnc_tensor_view_get_stride(h, hstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const gp = g->data.f32;
float* const bp = b->data.f32;
float* const hp = h->data.f32;
const int count = dim[2] * dim[3];
const float min = cmd.info.clamp.min;
const float max = cmd.info.clamp.max;
assert(!isnan(min) || !isnan(max))((void) sizeof ((!__builtin_isnan (min) || !__builtin_isnan (
max)) ? 1 : 0), __extension__ ({ if (!__builtin_isnan (min) ||
 !__builtin_isnan (max)) ; else __assert_fail ("!isnan(min) || !isnan(max)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1546, __extension__ __PRETTY_FUNCTION__
); }));
if (isnan(min)__builtin_isnan (min))
{
	if (gstride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
	{
		// Special casing if the ginc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* gp0 = gp + i[0] * gstride[0];
			float* bp0 = bp + i[0] * bstride[0];
			float* hp0 = hp + i[0] * hstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					hp0[x] = bp0[x] >= max ? 0 : gp0[x];
				gp0 += gstride[1];
				bp0 += bstride[1];
				hp0 += hstride[1];
			}
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const gp0 = gp + i[0] * gstride[0];
		float* const bp0 = bp + i[0] * bstride[0];
		float* const hp0 = hp + i[0] * hstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* gp1 = gp0 + i[1] * gstride[1];
			float* bp1 = bp0 + i[1] * bstride[1];
			float* hp1 = hp0 + i[1] * hstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					hp1[x] = bp1[x] >= max ? 0 : gp1[x];
				gp1 += gstride[2];
				bp1 += bstride[2];
				hp1 += hstride[2];
			}
		}
	}
} else if (isnan(max)__builtin_isnan (max)) {
	if (gstride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
	{
		// Special casing if the ginc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* gp0 = gp + i[0] * gstride[0];
			float* bp0 = bp + i[0] * bstride[0];
			float* hp0 = hp + i[0] * hstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					hp0[x] = bp0[x] <= min ? 0 : gp0[x];
				gp0 += gstride[1];
				bp0 += bstride[1];
				hp0 += hstride[1];
			}
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const gp0 = gp + i[0] * gstride[0];
		float* const bp0 = bp + i[0] * bstride[0];
		float* const hp0 = hp + i[0] * hstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* gp1 = gp0 + i[1] * gstride[1];
			float* bp1 = bp0 + i[1] * bstride[1];
			float* hp1 = hp0 + i[1] * hstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					hp1[x] = bp1[x] <= min ? 0 : gp1[x];
				gp1 += gstride[2];
				bp1 += bstride[2];
				hp1 += hstride[2];
			}
		}
	}
} else {
	if (gstride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
	{
		// Special casing if the ginc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* gp0 = gp + i[0] * gstride[0];
			float* bp0 = bp + i[0] * bstride[0];
			float* hp0 = hp + i[0] * hstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					hp0[x] = (bp0[x] >= max || bp0[x] <= min) ? 0 : gp0[x];
				gp0 += gstride[1];
				bp0 += bstride[1];
				hp0 += hstride[1];
			}
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const gp0 = gp + i[0] * gstride[0];
		float* const bp0 = bp + i[0] * bstride[0];
		float* const hp0 = hp + i[0] * hstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* gp1 = gp0 + i[1] * gstride[1];
			float* bp1 = bp0 + i[1] * bstride[1];
			float* hp1 = hp0 + i[1] * hstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					hp1[x] = (bp1[x] >= max || bp1[x] <= min) ? 0 : gp1[x];
				gp1 += gstride[2];
				bp1 += bstride[2];
				hp1 += hstride[2];
			}
		}
	}
}
} else {
if (!CCV_IS_TENSOR_VIEW(h)((*(int*)(h)) & CCV_TENSOR_VIEW) && !CCV_IS_TENSOR_VIEW(b)((*(int*)(b)) & CCV_TENSOR_VIEW))
{
	// Super optimal case, just do one for-loop for sum.
	const int tensor_count = ccv_nnc_tensor_count(h->info);
	if (isnan(min)__builtin_isnan (min))
	{
		for (x = 0; x < tensor_count; x++)
			h->data.f32[x] = b->data.f32[x] >= max ? 0 : 1;
	} else if (isnan(max)__builtin_isnan (max)) {
		for (x = 0; x < tensor_count; x++)
			h->data.f32[x] = b->data.f32[x] <= min ? 0 : 1;
	} else {
		for (x = 0; x < tensor_count; x++)
			h->data.f32[x] = (b->data.f32[x] >= max || b->data.f32[x] <= min) ? 0 : 1;
	}
	return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "ew/ccv_nnc_ew_cpu_ref.c"
, 1690, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
ccv_nnc_tensor_view_get_stride(b, bstride);
ccv_nnc_tensor_view_get_stride(h, hstride);
int i[CCV_NNC_MAX_DIM(2) + 2];
float* const bp = b->data.f32;
float* const hp = h->data.f32;
const int count = dim[2] * dim[3];
const float min = cmd.info.clamp.min;
const float max = cmd.info.clamp.max;
assert(!isnan(min) || !isnan(max))((void) sizeof ((!__builtin_isnan (min) || !__builtin_isnan (
max)) ? 1 : 0), __extension__ ({ if (!__builtin_isnan (min) ||
 !__builtin_isnan (max)) ; else __assert_fail ("!isnan(min) || !isnan(max)"
, "ew/ccv_nnc_ew_cpu_ref.c", 1699, __extension__ __PRETTY_FUNCTION__
); }));
if (isnan(min)__builtin_isnan (min))
{
	if (bstride[2] == dim[3] && hstride[2] == dim[3])
	{
		// Special casing if the binc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* bp0 = bp + i[0] * bstride[0];
			float* hp0 = hp + i[0] * hstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					hp0[x] = bp0[x] >= max ? 0 : 1;
				bp0 += bstride[1];
				hp0 += hstride[1];
			}
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const bp0 = bp + i[0] * bstride[0];
		float* const hp0 = hp + i[0] * hstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* bp1 = bp0 + i[1] * bstride[1];
			float* hp1 = hp0 + i[1] * hstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					hp1[x] = bp1[x] >= max ? 0 : 1;
				bp1 += bstride[2];
				hp1 += hstride[2];
			}
		}
	}
} else if (isnan(max)__builtin_isnan (max)) {
	if (bstride[2] == dim[3] && hstride[2] == dim[3])
	{
		// Special casing if the binc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* bp0 = bp + i[0] * bstride[0];
			float* hp0 = hp + i[0] * hstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					hp0[x] = bp0[x] <= min ? 0 : 1;
				bp0 += bstride[1];
				hp0 += hstride[1];
			}
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const bp0 = bp + i[0] * bstride[0];
		float* const hp0 = hp + i[0] * hstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* bp1 = bp0 + i[1] * bstride[1];
			float* hp1 = hp0 + i[1] * hstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					hp1[x] = bp1[x] <= min ? 0 : 1;
				bp1 += bstride[2];
				hp1 += hstride[2];
			}
		}
	}
} else {
	if (bstride[2] == dim[3] && hstride[2] == dim[3])
	{
		// Special casing if the binc[3] is the same as dim[3]
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			float* bp0 = bp + i[0] * bstride[0];
			float* hp0 = hp + i[0] * hstride[0];
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					hp0[x] = (bp0[x] >= max || bp0[x] <= min) ? 0 : 1;
				bp0 += bstride[1];
				hp0 += hstride[1];
			}
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		float* const bp0 = bp + i[0] * bstride[0];
		float* const hp0 = hp + i[0] * hstride[0];
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			float* bp1 = bp0 + i[1] * bstride[1];
			float* hp1 = hp0 + i[1] * hstride[1];
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					hp1[x] = (bp1[x] >= max || bp1[x] <= min) ? 0 : 1;
				bp1 += bstride[2];
				hp1 += hstride[2];
			}
		}
	}
}
}
return CCV_NNC_EXEC_SUCCESS;
1812}

1814REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWSUM_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1815{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewsum_forw;
1821}

1823REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWSUM_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1824{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewsum_back;
1830}

1832REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_FORWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWPROD_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1833{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewprod_forw;
1839}

1841REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWPROD_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1842{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewprod_back;
1848}

1850REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWDIV_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1851{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewdiv_forw;
1857}

1859REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWDIV_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1860{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewdiv_back;
1866}

1868REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWEXP_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1869{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewexp_forw;
1875}

1877REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWEXP_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1878{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewexp_back;
1884}

1886REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWLOG_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1887{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewlog_forw;
1893}

1895REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_BACKWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWLOG_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1896{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewlog_back;
1902}

1904REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWSQRT_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1905{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewsqrt_forw;
1911}

1913REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_BACKWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWSQRT_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1914{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewsqrt_back;
1920}

1922REGISTER_COMMAND_BACKEND(CCV_NNC_EWABS_FORWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWABS_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1923{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewabs_forw;
1929}

1931REGISTER_COMMAND_BACKEND(CCV_NNC_EWABS_BACKWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_EWABS_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1932{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewabs_back;
1938}

1940REGISTER_COMMAND_BACKEND(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_CLAMP_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1941{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_clamp_forw;
1947}

1949REGISTER_COMMAND_BACKEND(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_CLAMP_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
1950{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_clamp_back;
1956}