90     const uint8_t *mm_end       = end - 3;
 
   93         register unsigned x = *((
const uint32_t *)s);
 
   94         *((uint32_t *)d)    = (x & 0x7FFF7FFF) + (x & 0x7FE07FE0);
 
   99         register unsigned short x = *((
const uint16_t *)s);
 
  100         *((uint16_t *)d)          = (x & 0x7FFF) + (x & 0x7FE0);
 
  109     const uint8_t *mm_end       = end - 3;
 
  112         register uint32_t x  = *((
const uint32_t *)s);
 
  113         *((uint32_t *)d)     = ((x >> 1) & 0x7FE07FE0) | (x & 0x001F001F);
 
  118         register uint16_t x = *((
const uint16_t *)s);
 
  119         *((uint16_t *)d)    = ((x >> 1) & 0x7FE0) | (x & 0x001F);
 
  125     uint16_t *d        = (uint16_t *)dst;
 
  130         register int rgb  = *(
const uint32_t *)s;
 
  132         *d++              = ((rgb & 0xFF)     >> 3) +
 
  133                             ((rgb & 0xFC00)   >> 5) +
 
  134                             ((rgb & 0xF80000) >> 8);
 
  141     uint16_t *d        = (uint16_t *)dst;
 
  146         register int rgb  = *(
const uint32_t *)s;
 
  148         *d++              = ((rgb & 0xF8)     << 8) +
 
  149                             ((rgb & 0xFC00)   >> 5) +
 
  150                             ((rgb & 0xF80000) >> 19);
 
  156     uint16_t *d        = (uint16_t *)dst;
 
  161         register int rgb  = *(
const uint32_t *)s;
 
  163         *d++              = ((rgb & 0xFF)     >> 3) +
 
  164                             ((rgb & 0xF800)   >> 6) +
 
  165                             ((rgb & 0xF80000) >> 9);
 
  172     uint16_t *d        = (uint16_t *)dst;
 
  177         register int rgb  = *(
const uint32_t *)s;
 
  179         *d++              = ((rgb & 0xF8)     <<  7) +
 
  180                             ((rgb & 0xF800)   >>  6) +
 
  181                             ((rgb & 0xF80000) >> 19);
 
  188     uint16_t *d        = (uint16_t *)dst;
 
  196         *d++        = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
 
  202     uint16_t *d        = (uint16_t *)dst;
 
  210         *d++        = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
 
  217     uint16_t *d        = (uint16_t *)dst;
 
  225         *d++        = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
 
  231     uint16_t *d        = (uint16_t *)dst;
 
  239         *d++        = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
 
  247     const uint16_t *
s   = (
const uint16_t *)src;
 
  248     const uint16_t *
end = s + src_size / 2;
 
  251         register uint16_t bgr = *s++;
 
  252         *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
 
  253         *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
 
  254         *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
 
  262     const uint16_t *
s   = (
const uint16_t *)src;
 
  263     const uint16_t *
end = s + src_size / 2;
 
  266         register uint16_t bgr = *s++;
 
  267         *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
 
  268         *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
 
  269         *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
 
  276     const uint16_t *
s   = (
const uint16_t *)src;
 
  277     const uint16_t *
end = s + src_size / 2;
 
  280         register uint16_t bgr = *s++;
 
  283         *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
 
  284         *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
 
  285         *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
 
  287         *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
 
  288         *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
 
  289         *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
 
  298     const uint16_t *
s   = (
const uint16_t *)src;
 
  299     const uint16_t *
end = s + src_size / 2;
 
  302         register uint16_t bgr = *s++;
 
  305         *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
 
  306         *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
 
  307         *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
 
  309         *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
 
  310         *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
 
  311         *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
 
  320     int idx          = 15  - src_size;
 
  324     for (; idx < 15; idx += 4) {
 
  325         register int v        = *(
const uint32_t *)&s[idx], 
g = v & 0xff00ff00;
 
  327         *(uint32_t *)&d[idx]  = (v >> 16) + 
g + (v << 16);
 
  335     for (i = 0; i < src_size; i += 3) {
 
  336         register uint8_t x = src[i + 2];
 
  337         dst[i + 1]         = src[i + 1];
 
  338         dst[i + 2]         = src[i + 0];
 
  346                                      int lumStride, 
int chromStride,
 
  347                                      int dstStride, 
int vertLumPerChroma)
 
  350     const int chromWidth = width >> 1;
 
  352     for (y = 0; y < 
height; y++) {
 
  354         uint64_t *ldst = (uint64_t *)dst;
 
  355         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
 
  356         for (i = 0; i < chromWidth; i += 2) {
 
  357             uint64_t k = yc[0] + (uc[0] << 8) +
 
  358                          (yc[1] << 16) + (unsigned)(vc[0] << 24);
 
  359             uint64_t l = yc[2] + (uc[1] << 8) +
 
  360                          (yc[3] << 16) + (unsigned)(vc[1] << 24);
 
  361             *ldst++ = k + (l << 32);
 
  369         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
 
  371         for (i = 0; i < chromWidth; i++) {
 
  373             *idst++ = (yc[0] << 24) + (uc[0] << 16) +
 
  374                       (yc[1] <<  8) + (vc[0] <<  0);
 
  376             *idst++ = yc[0] + (uc[0] << 8) +
 
  377                       (yc[1] << 16) + (vc[0] << 24);
 
  384         if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
 
  400                                 int chromStride, 
int dstStride)
 
  404                       chromStride, dstStride, 2);
 
  410                                      int lumStride, 
int chromStride,
 
  411                                      int dstStride, 
int vertLumPerChroma)
 
  414     const int chromWidth = width >> 1;
 
  416     for (y = 0; y < 
height; y++) {
 
  418         uint64_t *ldst = (uint64_t *)dst;
 
  419         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
 
  420         for (i = 0; i < chromWidth; i += 2) {
 
  421             uint64_t k = uc[0] + (yc[0] << 8) +
 
  422                          (vc[0] << 16) + (unsigned)(yc[1] << 24);
 
  423             uint64_t l = uc[1] + (yc[2] << 8) +
 
  424                          (vc[1] << 16) + (unsigned)(yc[3] << 24);
 
  425             *ldst++ = k + (l << 32);
 
  433         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
 
  435         for (i = 0; i < chromWidth; i++) {
 
  437             *idst++ = (uc[0] << 24) + (yc[0] << 16) +
 
  438                       (vc[0] <<  8) + (yc[1] <<  0);
 
  440             *idst++ = uc[0] + (yc[0] << 8) +
 
  441                       (vc[0] << 16) + (yc[1] << 24);
 
  448         if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
 
  464                                 int chromStride, 
int dstStride)
 
  468                       chromStride, dstStride, 2);
 
  477                                    int chromStride, 
int dstStride)
 
  480                       chromStride, dstStride, 1);
 
  489                                    int chromStride, 
int dstStride)
 
  492                       chromStride, dstStride, 1);
 
  502                                 int chromStride, 
int srcStride)
 
  505     const int chromWidth = width >> 1;
 
  507     for (y = 0; y < 
height; y += 2) {
 
  509         for (i = 0; i < chromWidth; i++) {
 
  510             ydst[2 * i + 0] = src[4 * i + 0];
 
  511             udst[i]         = src[4 * i + 1];
 
  512             ydst[2 * i + 1] = src[4 * i + 2];
 
  513             vdst[i]         = src[4 * i + 3];
 
  518         for (i = 0; i < chromWidth; i++) {
 
  519             ydst[2 * i + 0] = src[4 * i + 0];
 
  520             ydst[2 * i + 1] = src[4 * i + 2];
 
  530                               int srcHeight, 
int srcStride, 
int dstStride)
 
  537     for (x = 0; x < srcWidth - 1; x++) {
 
  538         dst[2 * x + 1] = (3 * src[x] + src[x + 1]) >> 2;
 
  539         dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
 
  541     dst[2 * srcWidth - 1] = src[srcWidth - 1];
 
  545     for (y = 1; y < srcHeight; y++) {
 
  546         const int mmxSize = 1;
 
  548         dst[0]         = (src[0] * 3 + src[srcStride]) >> 2;
 
  549         dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
 
  551         for (x = mmxSize - 1; x < srcWidth - 1; x++) {
 
  552             dst[2 * x + 1]             = (src[x + 0] * 3 + src[x + srcStride + 1]) >> 2;
 
  553             dst[2 * x + dstStride + 2] = (src[x + 0] + 3 * src[x + srcStride + 1]) >> 2;
 
  554             dst[2 * x + dstStride + 1] = (src[x + 1] + 3 * src[x + srcStride])     >> 2;
 
  555             dst[2 * x + 2]             = (src[x + 1] * 3 + src[x + srcStride])     >> 2;
 
  557         dst[srcWidth * 2 - 1]             = (src[srcWidth - 1] * 3 + src[srcWidth - 1 + srcStride]) >> 2;
 
  558         dst[srcWidth * 2 - 1 + dstStride] = (src[srcWidth - 1] + 3 * src[srcWidth - 1 + srcStride]) >> 2;
 
  560         dst += dstStride * 2;
 
  567     for (x = 0; x < srcWidth - 1; x++) {
 
  568         dst[2 * x + 1] = (src[x] * 3 + src[x + 1]) >> 2;
 
  569         dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
 
  571     dst[2 * srcWidth - 1] = src[srcWidth - 1];
 
  583                                 int chromStride, 
int srcStride)
 
  586     const int chromWidth = width >> 1;
 
  588     for (y = 0; y < 
height; y += 2) {
 
  590         for (i = 0; i < chromWidth; i++) {
 
  591             udst[i]         = src[4 * i + 0];
 
  592             ydst[2 * i + 0] = src[4 * i + 1];
 
  593             vdst[i]         = src[4 * i + 2];
 
  594             ydst[2 * i + 1] = src[4 * i + 3];
 
  599         for (i = 0; i < chromWidth; i++) {
 
  600             ydst[2 * i + 0] = src[4 * i + 1];
 
  601             ydst[2 * i + 1] = src[4 * i + 3];
 
  619                    int chromStride, 
int srcStride, 
int32_t *rgb2yuv)
 
  625     const int chromWidth = width >> 1;
 
  627     for (y = 0; y < 
height; y += 2) {
 
  629         for (i = 0; i < chromWidth; i++) {
 
  630             unsigned int b = src[6 * i + 0];
 
  631             unsigned int g = src[6 * i + 1];
 
  632             unsigned int r = src[6 * i + 2];
 
  655         for (i = 0; i < chromWidth; i++) {
 
  656             unsigned int b = src[6 * i + 0];
 
  657             unsigned int g = src[6 * i + 1];
 
  658             unsigned int r = src[6 * i + 2];
 
  680                               int src1Stride, 
int src2Stride, 
int dstStride)
 
  684     for (h = 0; h < 
height; h++) {
 
  686         for (w = 0; w < 
width; w++) {
 
  687             dest[2 * w + 0] = src1[w];
 
  688             dest[2 * w + 1] = src2[w];
 
  699                                  int srcStride1, 
int srcStride2,
 
  700                                  int dstStride1, 
int dstStride2)
 
  706     for (y = 0; y < h; y++) {
 
  707         const uint8_t *
s1 = src1 + srcStride1 * (y >> 1);
 
  709         for (x = 0; x < w; x++)
 
  710             d[2 * x] = d[2 * x + 1] = s1[x];
 
  712     for (y = 0; y < h; y++) {
 
  713         const uint8_t *
s2 = src2 + srcStride2 * (y >> 1);
 
  715         for (x = 0; x < w; x++)
 
  716             d[2 * x] = d[2 * x + 1] = s2[x];
 
  723                                   int srcStride1, 
int srcStride2,
 
  724                                   int srcStride3, 
int dstStride)
 
  730     for (y = 0; y < h; y++) {
 
  731         const uint8_t *yp = src1 + srcStride1 *  
y;
 
  732         const uint8_t *up = src2 + srcStride2 * (y >> 2);
 
  733         const uint8_t *vp = src3 + srcStride3 * (y >> 2);
 
  735         for (x = 0; x < w; x++) {
 
  736             const int x2 = x << 2;
 
  737             d[8 * x + 0] = yp[x2];
 
  738             d[8 * x + 1] = up[x];
 
  739             d[8 * x + 2] = yp[x2 + 1];
 
  740             d[8 * x + 3] = vp[x];
 
  741             d[8 * x + 4] = yp[x2 + 2];
 
  742             d[8 * x + 5] = up[x];
 
  743             d[8 * x + 6] = yp[x2 + 3];
 
  744             d[8 * x + 7] = vp[x];
 
  768         dst0[
count] = src[4 * count + 0];
 
  769         dst1[
count] = src[4 * count + 2];
 
  783         dst0[
count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
 
  784         dst1[
count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
 
  798         dst0[
count] = src[4 * count + 0];
 
  799         dst1[
count] = src[4 * count + 2];
 
  815         dst0[
count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
 
  816         dst1[
count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
 
  823                            int lumStride, 
int chromStride, 
int srcStride)
 
  828     for (y = 0; y < 
height; y++) {
 
  843                            int lumStride, 
int chromStride, 
int srcStride)
 
  848     for (y = 0; y < 
height; y++) {
 
  861                            int lumStride, 
int chromStride, 
int srcStride)
 
  866     for (y = 0; y < 
height; y++) {
 
  881                            int lumStride, 
int chromStride, 
int srcStride)
 
  886     for (y = 0; y < 
height; y++) {