2013年9月14日 星期六

Android camera YUV (NV21) to RGB / RGBA conversion with ARM NEON

///
/// NV21 to RGB
///
/// Fetch eight consecutive bytes starting at `uv` into one 64-bit NEON register.
/// The caller treats them as four interleaved chroma pairs.
static uint8x8_t const loaduv(unsigned char const* uv)
{
    uint8x8_t const packed = vld1_u8(uv);
    return packed;
}

static void store_pixel_block(unsigned char* dst, uint8x8x3_t& pblock, uint8x8_t const& r, uint8x8_t const& g, uint8x8_t const& b)
{
   pblock.val[0] = r;
   pblock.val[1] = g;
   pblock.val[2] = b;
   vst3_u8(dst, pblock);
}

static void Decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv, int width, int height)
{
    // pre-condition : width, height must be even
    //if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv)
    //    return;

    // tmp variable
    uint16x8_t tmp;

    // in & out pointers
    uint8_t* dst = out;

    // constants
    int const stride = width*s_bytes_per_pixel;
    int const itHeight = height>>1;
    int const itWidth = width>>3;

    uint8x8_t const Yshift = vdup_n_u8(16);
    int16x8_t const half = (int16x8_t)vdupq_n_u16(128);
    int32x4_t const rounding = vdupq_n_s32(128);

    // pixel block to temporary store 8 pixels
    uint8x8x3_t pblock = uint8x8x3_t();

    for (int j=0; j<itHeight; ++j, y+=width, dst+=stride)
    {
        for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*s_bytes_per_pixel))
        {
            tmp = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift));
            int32x4_t const Y00 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_low_u16(tmp)), 298);
            int32x4_t const Y01 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_high_u16(tmp)), 298);

            tmp = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift));
            int32x4_t const Y10 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_low_u16(tmp)), 298);
            int32x4_t const Y11 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_high_u16(tmp)), 298);

            // loaduv pack 4 sets of uv into a uint8x8_t, layout : { v0, u0, v1,u1, v2,u2, v3,u3 }
            tmp = (uint16x8_t)vsubq_s16((int16x8_t)vmovl_u8(loaduv(uv)), half);

            // tUV.val[0] : v0, v1, v2, v3
            // tUV.val[1] : u0, u1, u2, u3
            int16x4x2_t const tUV = vuzp_s16(vget_low_s16((int16x8_t)tmp), vget_high_s16((int16x8_t)tmp));

            // tR : 128+409V
            // tG : 128-100U-208V
            // tB : 128+516U
            int32x4_t const tR = vmlal_n_s16(rounding, tUV.val[0], 409);
            int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, tUV.val[0], -208), tUV.val[1], -100);
            int32x4_t const tB = vmlal_n_s16(rounding, tUV.val[1], 516);

            int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [ tR2, tR2, tR3, tR3]
            int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [ tG2, tG2, tG3, tG3]
            int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [ tB2, tB2, tB3, tB3]

            // upper 8 pixels
            store_pixel_block(    dst,
                                                pblock,
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8),
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8),
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8));

            // lower 8 pixels
            store_pixel_block(    dst+stride,
                                                pblock,
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8),
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8),
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8));
        }
    }
}


/// JNI entry point: convert an NV21 frame held in `datain` to packed RGB in `dataout`.
///
/// @param env     JNI environment of the calling thread
/// @param obj     receiver object (unused)
/// @param width   frame width in pixels
/// @param height  frame height in pixels
/// @param datain  NV21 bytes: `width*height` luma followed by `width*height/2` chroma
/// @param dataout receives the RGB bytes; must hold at least `width*height*3` bytes
///
/// Non-positive dimensions are a no-op. A null or undersized array raises
/// java.lang.IllegalArgumentException instead of overrunning native memory.
/// If the JVM cannot provide the array elements the call returns without converting.
void Interface_Yuv2Rgb(JNIEnv* env, jobject obj, jint width, jint height, jbyteArray datain, jbyteArray dataout)
{
    (void)obj;

    if (width<=0 || height<=0)
        return;

    // Widen before multiplying so large dimensions cannot overflow jint.
    jlong const pixels = (jlong)width*(jlong)height;

    // The decoder stores 3 bytes per pixel (vst3_u8), so anything smaller would overflow.
    if (!datain || !dataout ||
        (jlong)env->GetArrayLength(datain) < pixels+pixels/2 ||
        (jlong)env->GetArrayLength(dataout) < pixels*3)
    {
        jclass const badArg = env->FindClass("java/lang/IllegalArgumentException");
        if (badArg)
            env->ThrowNew(badArg, "Interface_Yuv2Rgb: null or undersized buffer for width*height");
        return;
    }

    jbyte* buffer_in = env->GetByteArrayElements(datain, NULL);
    if (!buffer_in)
        return;

    jbyte* buffer_out = env->GetByteArrayElements(dataout, NULL);
    if (!buffer_out)
    {
        env->ReleaseByteArrayElements(datain, buffer_in, JNI_ABORT);
        return;
    }

    // The original author reports this as faster than OpenCV's colour conversion.
    Decode_yuv_neon((unsigned char*)buffer_out, (unsigned char const*)buffer_in, (unsigned char const*)buffer_in+pixels, width, height);

    // Input was only read: JNI_ABORT frees it without copying anything back.
    env->ReleaseByteArrayElements(datain, buffer_in, JNI_ABORT);
    // Mode 0 commits the converted pixels to the Java array and frees the buffer.
    env->ReleaseByteArrayElements(dataout, buffer_out, 0);
}

///
/// NV21 to RGBA
///
/// Fetch eight consecutive bytes starting at `uv` into one 64-bit NEON register.
/// The caller treats them as four interleaved chroma pairs.
static uint8x8_t const loaduv(unsigned char const* uv)
{
    uint8x8_t const packed = vld1_u8(uv);
    return packed;
}

static void store_pixel_block(unsigned char* dst, uint8x8x4_t& pblock, uint8x8_t const& r, uint8x8_t const& g, uint8x8_t const& b)
{
    pblock.val[0] = r;
    pblock.val[1] = g;
    pblock.val[2] = b;
    vst4_u8(dst, pblock);
}

static void Decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv, int width, int height, unsigned char fill_alpha)
{
    // pre-condition : width, height must be even
    //if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv)
    //    return;

    // tmp variable
    uint16x8_t tmp;

    // in & out pointers
    uint8_t* dst = out;

    // constants
    int const stride = width*s_bytes_per_pixel;
    int const itHeight = height>>1;
    int const itWidth = width>>3;

    uint8x8_t const Yshift = vdup_n_u8(16);
    int16x8_t const half = (int16x8_t)vdupq_n_u16(128);
    int32x4_t const rounding = vdupq_n_s32(128);

    // pixel block to temporary store 8 pixels
    uint8x8x4_t pblock;
    pblock.val[3] = vdup_n_u8(fill_alpha); // alpha channel in the last

    for (int j=0; j<itHeight; ++j, y+=width, dst+=stride)
    {
        for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*s_bytes_per_pixel))
        {
            tmp = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift));
            int32x4_t const Y00 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_low_u16(tmp)), 298);
            int32x4_t const Y01 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_high_u16(tmp)), 298);

            tmp = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift));
            int32x4_t const Y10 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_low_u16(tmp)), 298);
            int32x4_t const Y11 = (int32x4_t)vmulq_n_u32(vmovl_u16(vget_high_u16(tmp)), 298);

            // loaduv pack 4 sets of uv into a uint8x8_t, layout : { v0, u0, v1,u1, v2,u2, v3,u3 }
            tmp = (uint16x8_t)vsubq_s16((int16x8_t)vmovl_u8(loaduv(uv)), half);

            // tUV.val[0] : v0, v1, v2, v3
            // tUV.val[1] : u0, u1, u2, u3
            int16x4x2_t const tUV = vuzp_s16(vget_low_s16((int16x8_t)tmp), vget_high_s16((int16x8_t)tmp));

            // tR : 128+409V
            // tG : 128-100U-208V
            // tB : 128+516U
            int32x4_t const tR = vmlal_n_s16(rounding, tUV.val[0], 409);
            int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, tUV.val[0], -208), tUV.val[1], -100);
            int32x4_t const tB = vmlal_n_s16(rounding, tUV.val[1], 516);

            int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [ tR2, tR2, tR3, tR3]
            int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [ tG2, tG2, tG3, tG3]
            int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [ tB2, tB2, tB3, tB3]

            // upper 8 pixels
            store_pixel_block(    dst,
                                                pblock,
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8),
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8),
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8));

            // lower 8 pixels
            store_pixel_block(    dst+stride,
                                                pblock,
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8),
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8),
                                                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8));
        }
    }
}

/// JNI entry point: convert an NV21 frame held in `datain` to packed RGBA in
/// `dataout`, with every alpha byte set to 0xff.
///
/// @param env     JNI environment of the calling thread
/// @param obj     receiver object (unused)
/// @param width   frame width in pixels
/// @param height  frame height in pixels
/// @param datain  NV21 bytes: `width*height` luma followed by `width*height/2` chroma
/// @param dataout receives the RGBA bytes; must hold at least `width*height*4` bytes
///
/// Non-positive dimensions are a no-op. A null or undersized array raises
/// java.lang.IllegalArgumentException instead of overrunning native memory.
/// If the JVM cannot provide the array elements the call returns without converting.
void Interface_Yuv2Rgba(JNIEnv* env, jobject obj, jint width, jint height, jbyteArray datain, jbyteArray dataout)
{
    (void)obj;

    if (width<=0 || height<=0)
        return;

    // Widen before multiplying so large dimensions cannot overflow jint.
    jlong const pixels = (jlong)width*(jlong)height;

    // The decoder stores 4 bytes per pixel (vst4_u8), so anything smaller would overflow.
    if (!datain || !dataout ||
        (jlong)env->GetArrayLength(datain) < pixels+pixels/2 ||
        (jlong)env->GetArrayLength(dataout) < pixels*4)
    {
        jclass const badArg = env->FindClass("java/lang/IllegalArgumentException");
        if (badArg)
            env->ThrowNew(badArg, "Interface_Yuv2Rgba: null or undersized buffer for width*height");
        return;
    }

    jbyte* buffer_in = env->GetByteArrayElements(datain, NULL);
    if (!buffer_in)
        return;

    jbyte* buffer_out = env->GetByteArrayElements(dataout, NULL);
    if (!buffer_out)
    {
        env->ReleaseByteArrayElements(datain, buffer_in, JNI_ABORT);
        return;
    }

    // Opaque alpha (0xff) for every pixel.
    Decode_yuv_neon((unsigned char*)buffer_out, (unsigned char const*)buffer_in, (unsigned char const*)buffer_in+pixels, width, height, 0xff);

    // Input was only read: JNI_ABORT frees it without copying anything back.
    env->ReleaseByteArrayElements(datain, buffer_in, JNI_ABORT);
    // Mode 0 commits the converted pixels to the Java array and frees the buffer.
    env->ReleaseByteArrayElements(dataout, buffer_out, 0);
}

///
///Android.mk
///
# ndk-build description of the shared library that hosts the conversion routines.
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)

LOCAL_MODULE := libfacecolor

# C++11 sources compiled with NEON code generation; HAVE_NEON is passed to the code as a macro.
LOCAL_CFLAGS := -std=c++11 -DHAVE_NEON=1 -mfpu=neon -mfloat-abi=softfp

LOCAL_C_INCLUDES := ../cc_src
LOCAL_SRC_FILES  := ../cc_src/interfacefuncs.cpp

# Prebuilt shared-library dependency, currently disabled.
#LOCAL_SHARED_LIBRARIES := libsdkprebuilt

LOCAL_LDLIBS := -lz -ldl -llog -pthread

# Uncomment for an unoptimised, unstripped build with debug info.
#LOCAL_CFLAGS += -O0 -g
#LOCAL_CPPFLAGS += -O0 -g
#LOCAL_STRIP_MODULE := false

include $(BUILD_SHARED_LIBRARY)

沒有留言:

張貼留言