///
///NV21 to RGB
///
// load 4 interleaved VU pairs (8 bytes) from the NV21 chroma plane
static uint8x8_t loaduv(unsigned char const* uv)
{
    return vld1_u8(uv);
}
// interleave the r/g/b lanes and store 8 RGB pixels (24 bytes) at dst
static void store_pixel_block(unsigned char* dst, uint8x8x3_t& pblock, uint8x8_t const& r, uint8x8_t const& g, uint8x8_t const& b)
{
    pblock.val[0] = r;
    pblock.val[1] = g;
    pblock.val[2] = b;
    vst3_u8(dst, pblock);
}
static void Decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv, int width, int height)
{
    // pre-condition: width and height must be even, and width a multiple of 8
    //if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv)
    //    return;
    // temporary variable
    uint16x8_t tmp;
    // output pointer
    uint8_t* dst = out;
    // constants (s_bytes_per_pixel is assumed to be 3 for packed RGB output)
    int const stride = width*s_bytes_per_pixel;
    int const itHeight = height>>1;  // two rows are processed per iteration
    int const itWidth = width>>3;    // 8 pixels are processed per iteration
    uint8x8_t const Yshift = vdup_n_u8(16);
    int16x8_t const half = vdupq_n_s16(128);
    int32x4_t const rounding = vdupq_n_s32(128);
    // pixel block used to temporarily hold 8 interleaved pixels
    uint8x8x3_t pblock = uint8x8x3_t();
    // the inner loop already advances y/dst by one row, so the outer increment skips the second row
    for (int j=0; j<itHeight; ++j, y+=width, dst+=stride)
    {
        for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*s_bytes_per_pixel))
        {
            // 298*(Y-16) for the upper row, low and high halves of the 8 pixels
            tmp = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift));
            int32x4_t const Y00 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_low_u16(tmp)), 298);
            int32x4_t const Y01 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_high_u16(tmp)), 298);
            // 298*(Y-16) for the lower row
            tmp = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift));
            int32x4_t const Y10 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_low_u16(tmp)), 298);
            int32x4_t const Y11 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_high_u16(tmp)), 298);
            // loaduv packs 4 VU pairs into a uint8x8_t, layout: { v0,u0, v1,u1, v2,u2, v3,u3 }
            tmp = (uint16x8_t)vsubq_s16((int16x8_t)vmovl_u8(loaduv(uv)), half);
            // tUV.val[0] : v0, v1, v2, v3
            // tUV.val[1] : u0, u1, u2, u3
            int16x4x2_t const tUV = vuzp_s16(vget_low_s16((int16x8_t)tmp), vget_high_s16((int16x8_t)tmp));
            // tR : 128+409V
            // tG : 128-100U-208V
            // tB : 128+516U
            int32x4_t const tR = vmlal_n_s16(rounding, tUV.val[0], 409);
            int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, tUV.val[0], -208), tUV.val[1], -100);
            int32x4_t const tB = vmlal_n_s16(rounding, tUV.val[1], 516);
            // duplicate each chroma term so it covers two horizontally adjacent pixels
            int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [tR2, tR2, tR3, tR3]
            int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [tG2, tG2, tG3, tG3]
            int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [tB2, tB2, tB3, tB3]
            // upper 8 pixels: per channel, saturate (298*(Y-16) + chroma term) and shift right by 8
            store_pixel_block( dst,
                               pblock,
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8),
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8),
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8));
            // lower 8 pixels
            store_pixel_block( dst+stride,
                               pblock,
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8),
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8),
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8));
        }
    }
}
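The NEON loop above is just a vectorized form of the usual BT.601 fixed-point YUV-to-RGB conversion, where each VU pair is shared by a 2x2 block of pixels. For reference, here is a minimal scalar sketch of the per-pixel math; it is not part of the original code, only illustrative, and the NEON path additionally clamps Y-16 at zero via vqsub_u8.

// scalar reference for one pixel of an NV21 frame (illustrative only)
static inline unsigned char clamp_u8(int v)
{
    return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}
static void nv21_pixel_to_rgb(int Y, int V, int U, unsigned char* rgb)
{
    int const y = 298 * (Y - 16);
    int const v = V - 128;
    int const u = U - 128;
    rgb[0] = clamp_u8((y + 409 * v + 128) >> 8);            // R
    rgb[1] = clamp_u8((y - 100 * u - 208 * v + 128) >> 8);  // G
    rgb[2] = clamp_u8((y + 516 * u + 128) >> 8);            // B
}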
void Interface_Yuv2Rgb(JNIEnv* env, jobject obj, jint width, jint height, jbyteArray datain, jbyteArray dataout)
{
    jbyte* buffer_in = env->GetByteArrayElements(datain, NULL);
    jbyte* buffer_out = env->GetByteArrayElements(dataout, NULL);
    // faster than OpenCV cvtColor for this conversion
    Decode_yuv_neon((unsigned char*)buffer_out, (unsigned char*)buffer_in, (unsigned char*)buffer_in+(width*height), width, height);
    env->ReleaseByteArrayElements(datain, buffer_in, JNI_ABORT); // input was only read, no need to copy it back
    env->ReleaseByteArrayElements(dataout, buffer_out, 0);
}
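If you want to exercise Decode_yuv_neon outside the JNI layer (for example in a native test), a minimal usage sketch could look like the following. Since the function is static it has to live in the same .cpp; convert_one_frame is a hypothetical helper, and the buffer layout follows NV21: width*height luma bytes followed by width*height/2 interleaved VU bytes.

#include <vector>
static void convert_one_frame(unsigned char const* nv21, int width, int height, std::vector<unsigned char>& rgb)
{
    // NV21: full-resolution Y plane, then a half-resolution interleaved VU plane
    unsigned char const* y_plane  = nv21;
    unsigned char const* uv_plane = nv21 + width * height;
    rgb.resize((size_t)width * height * 3);  // 3 bytes per pixel for packed RGB
    Decode_yuv_neon(rgb.data(), y_plane, uv_plane, width, height);
}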
///
///NV21 to RGBA
///
// loaduv is shared with the NV21-to-RGB path above (a second identical definition would not compile in the same file)
// interleave the r/g/b lanes and store 8 RGBA pixels (32 bytes) at dst; pblock.val[3] already holds the alpha lane
static void store_pixel_block(unsigned char* dst, uint8x8x4_t& pblock, uint8x8_t const& r, uint8x8_t const& g, uint8x8_t const& b)
{
    pblock.val[0] = r;
    pblock.val[1] = g;
    pblock.val[2] = b;
    vst4_u8(dst, pblock);
}
static void Decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv, int width, int height, unsigned char fill_alpha)
{
    // pre-condition: width and height must be even, and width a multiple of 8
    //if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv)
    //    return;
    // temporary variable
    uint16x8_t tmp;
    // output pointer
    uint8_t* dst = out;
    // constants (s_bytes_per_pixel is assumed to be 4 for packed RGBA output)
    int const stride = width*s_bytes_per_pixel;
    int const itHeight = height>>1;  // two rows are processed per iteration
    int const itWidth = width>>3;    // 8 pixels are processed per iteration
    uint8x8_t const Yshift = vdup_n_u8(16);
    int16x8_t const half = vdupq_n_s16(128);
    int32x4_t const rounding = vdupq_n_s32(128);
    // pixel block used to temporarily hold 8 interleaved pixels
    uint8x8x4_t pblock;
    pblock.val[3] = vdup_n_u8(fill_alpha); // alpha channel goes in the last lane
    // the inner loop already advances y/dst by one row, so the outer increment skips the second row
    for (int j=0; j<itHeight; ++j, y+=width, dst+=stride)
    {
        for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*s_bytes_per_pixel))
        {
            // 298*(Y-16) for the upper row, low and high halves of the 8 pixels
            tmp = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift));
            int32x4_t const Y00 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_low_u16(tmp)), 298);
            int32x4_t const Y01 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_high_u16(tmp)), 298);
            // 298*(Y-16) for the lower row
            tmp = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift));
            int32x4_t const Y10 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_low_u16(tmp)), 298);
            int32x4_t const Y11 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_high_u16(tmp)), 298);
            // loaduv packs 4 VU pairs into a uint8x8_t, layout: { v0,u0, v1,u1, v2,u2, v3,u3 }
            tmp = (uint16x8_t)vsubq_s16((int16x8_t)vmovl_u8(loaduv(uv)), half);
            // tUV.val[0] : v0, v1, v2, v3
            // tUV.val[1] : u0, u1, u2, u3
            int16x4x2_t const tUV = vuzp_s16(vget_low_s16((int16x8_t)tmp), vget_high_s16((int16x8_t)tmp));
            // tR : 128+409V
            // tG : 128-100U-208V
            // tB : 128+516U
            int32x4_t const tR = vmlal_n_s16(rounding, tUV.val[0], 409);
            int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, tUV.val[0], -208), tUV.val[1], -100);
            int32x4_t const tB = vmlal_n_s16(rounding, tUV.val[1], 516);
            // duplicate each chroma term so it covers two horizontally adjacent pixels
            int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [tR2, tR2, tR3, tR3]
            int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [tG2, tG2, tG3, tG3]
            int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [tB2, tB2, tB3, tB3]
            // upper 8 pixels: per channel, saturate (298*(Y-16) + chroma term) and shift right by 8
            store_pixel_block( dst,
                               pblock,
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8),
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8),
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8));
            // lower 8 pixels
            store_pixel_block( dst+stride,
                               pblock,
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8),
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8),
                               vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8));
        }
    }
}
void Interface_Yuv2Rgba(JNIEnv* env, jobject obj, jint width, jint height, jbyteArray datain, jbyteArray dataout)
{
    jbyte* buffer_in = env->GetByteArrayElements(datain, NULL);
    jbyte* buffer_out = env->GetByteArrayElements(dataout, NULL);
    // faster than OpenCV cvtColor for this conversion
    Decode_yuv_neon((unsigned char*)buffer_out, (unsigned char*)buffer_in, (unsigned char*)buffer_in+(width*height), width, height, 0xff);
    env->ReleaseByteArrayElements(datain, buffer_in, JNI_ABORT); // input was only read, no need to copy it back
    env->ReleaseByteArrayElements(dataout, buffer_out, 0);
}
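Neither wrapper follows the Java_<package>_<class>_<method> naming convention, so they presumably get registered explicitly. A minimal JNI_OnLoad sketch is shown below; the Java class name and the method names/signatures are assumptions for illustration, not taken from the original project.

#include <jni.h>
// hypothetical Java-side declarations (instance methods):
//   public native void yuv2rgb (int width, int height, byte[] in, byte[] out);
//   public native void yuv2rgba(int width, int height, byte[] in, byte[] out);
static JNINativeMethod const s_methods[] = {
    { "yuv2rgb",  "(II[B[B)V", (void*)Interface_Yuv2Rgb  },
    { "yuv2rgba", "(II[B[B)V", (void*)Interface_Yuv2Rgba },
};
extern "C" jint JNI_OnLoad(JavaVM* vm, void* /*reserved*/)
{
    JNIEnv* env = NULL;
    if (vm->GetEnv((void**)&env, JNI_VERSION_1_6) != JNI_OK)
        return JNI_ERR;
    // "com/example/FaceColor" is a placeholder class name
    jclass clazz = env->FindClass("com/example/FaceColor");
    if (!clazz || env->RegisterNatives(clazz, s_methods, sizeof(s_methods)/sizeof(s_methods[0])) != JNI_OK)
        return JNI_ERR;
    return JNI_VERSION_1_6;
}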
///
///Android.mk
///
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_MODULE := libfacecolor
LOCAL_CFLAGS := -std=c++11 -DHAVE_NEON=1 -mfpu=neon -mfloat-abi=softfp
LOCAL_C_INCLUDES := \
    ../cc_src
LOCAL_SRC_FILES := \
    ../cc_src/interfacefuncs.cpp
#LOCAL_SHARED_LIBRARIES := \
#    libsdkprebuilt
LOCAL_LDLIBS := \
    -lz -ldl -llog -pthread
#LOCAL_CFLAGS += -O0 -g
#LOCAL_CPPFLAGS += -O0 -g
#LOCAL_STRIP_MODULE := false
include $(BUILD_SHARED_LIBRARY)