The fastest solution I have found is to use ARM NEON inline assembly, like this:
__asm__ volatile(
"0: \n"
"# load 3 64-bit regs with interleave: \n"
"vld3.8 {d0,d1,d2}, [%0]! \n"
"# swap d0 and d2 - R and B\n"
"vswp d0, d2 \n"
"# store 4 64-bit regs: \n"
"vst4.8 {d0,d1,d2,d3}, [%2]! \n"
"subs %1, %1, #1 \n"
"bne 0b \n"
:
: "r"(img), "r"(numPixels24), "+r"(bgraData)
: "r6", "r7","r8"
);
Solved: How to quickly transform from RGB24 to BGRA on iOS?