#-----------------------------------------------------------------------
# RGBA -> greyscale conversion of an in-memory picture using SSE2.
#
# Syntax : GNU as, AT&T operand order (op src, dst).
# Target : 32-bit x86 Linux (32-bit pointers, "int $0x80" system calls).
# Build  : as toto.s -o toto.o
#          ld toto.o -o toto
#          (on an x86-64 host use "as --32" and "ld -m elf_i386")
#
# The _fill routine only exists to give predictable pixel values, so the
# operations and the resulting pixels can be checked in memory from a
# debugger (gdb / Eclipse).
#-----------------------------------------------------------------------

        .equ    IMAGE_WIDTH,     256
        .equ    IMAGE_HEIGHT,    256
        .equ    BYTES_PER_PIXEL, 4              # R, G, B, A
        .equ    IMAGE_BYTES,     IMAGE_WIDTH*IMAGE_HEIGHT*BYTES_PER_PIXEL
        .equ    BLOCK_BYTES,     16             # four pixels per XMM register
        .equ    BLOCK_SHIFT,     4              # log2(BLOCK_BYTES)
        .equ    SYS_EXIT,        1              # i386 exit system call number

        .section .data

        # "movdqa" (move double quadword aligned) faults on addresses that
        # are not multiples of sixteen bytes, so image must stay aligned.
        # If the alignment cannot be guaranteed, use "movdqu" instead.
        .align  16
image:
        .space  IMAGE_BYTES                     # byte order R G B A R G B A ...
imagelen:
        .long   IMAGE_BYTES                     # keep it a multiple of BLOCK_BYTES

charmaskinlong:
        .long   0x000000FF                      # selects the low byte of a dword

        # Luma weights expressed in 256ths. They add up to exactly 256, so
        # the weighted sum shifted right by eight never exceeds 255.
coefR:
        .long   77                              # about 0.30 * 256
coefG:
        .long   151                             # about 0.59 * 256
coefB:
        .long   28                              # about 0.11 * 256

        .section .text
        .global _start

#-----------------------------------------------------------------------
# _start : program entry point.
#   Fills the picture with a test pattern, converts every pixel to grey
#   (R = G = B = (R*cR + G*cG + B*cB) >> 8, alpha untouched), then exits
#   with status 0.
#
# Register roles while converting:
#   eax  = address of the current block of four pixels
#   ecx  = number of blocks left
#   xmm7 = 0x000000FF in each dword (byte mask)
#   xmm6 = words [cG, cR] in each dword
#   xmm5 = words [0,  cB] in each dword
#   xmm4 = 0xFF000000 in each dword (alpha mask)
#   xmm0-xmm2 = scratch
#-----------------------------------------------------------------------
_start:
        call    _fill

        movd    charmaskinlong, %xmm7
        pshufd  $0, %xmm7, %xmm7                # broadcast: 0x000000FF x4

        movd    coefR, %xmm6
        pshufd  $0, %xmm6, %xmm6                # each dword = cR
        movd    coefG, %xmm5
        pshufd  $0, %xmm5, %xmm5                # each dword = cG
        pslld   $16, %xmm5                      # move cG to the high word
        por     %xmm5, %xmm6                    # xmm6 words = [cG, cR]

        movd    coefB, %xmm5
        pshufd  $0, %xmm5, %xmm5                # xmm5 words = [0, cB]

        movdqa  %xmm7, %xmm4
        pslld   $24, %xmm4                      # xmm4 = 0xFF000000 x4

        movl    $image, %eax
        movl    imagelen, %ecx
        shrl    $BLOCK_SHIFT, %ecx              # unsigned byte count -> block count
        jz      _end                            # nothing to convert

_loop4:
        # Each pixel is read as the little-endian dword 0xAABBGGRR.
        movdqa  (%eax), %xmm0

        movdqa  %xmm0, %xmm1
        pand    %xmm7, %xmm1                    # 0x000000RR

        movdqa  %xmm0, %xmm2
        psrld   $8, %xmm2
        pand    %xmm7, %xmm2                    # 0x000000GG
        pslld   $16, %xmm2                      # 0x00GG0000
        por     %xmm2, %xmm1                    # words = [G, R]
        pmaddwd %xmm6, %xmm1                    # R*cR + G*cG as a dword

        movdqa  %xmm0, %xmm2
        psrld   $16, %xmm2
        pand    %xmm7, %xmm2                    # words = [0, B]
        pmaddwd %xmm5, %xmm2                    # B*cB as a dword

        # One shift on the full sum: no intermediate truncation, so a
        # white pixel (255,255,255) stays 255.
        paddd   %xmm2, %xmm1
        psrld   $8, %xmm1                       # grey, 0..255, in the low byte

        pand    %xmm4, %xmm0                    # keep alpha only: 0xAA000000
        por     %xmm1, %xmm0                    # 0xAA0000Gr
        pslld   $8, %xmm1
        por     %xmm1, %xmm0                    # 0xAA00GrGr
        pslld   $8, %xmm1
        por     %xmm1, %xmm0                    # 0xAAGrGrGr

        movdqa  %xmm0, (%eax)
        add     $BLOCK_BYTES, %eax
        dec     %ecx
        jnz     _loop4

_end:
        mov     $SYS_EXIT, %eax
        xor     %ebx, %ebx                      # exit status 0
        int     $0x80

#-----------------------------------------------------------------------
# _fill : write the repeating byte pattern 0, 1, 2, ... 255, 0, 1, ...
#         over the imagelen bytes of image.
#   In   : nothing
#   Out  : nothing
#   Clobbers eax, ebx, ecx and the flags.
#-----------------------------------------------------------------------
_fill:
        movl    $image, %eax                    # eax = write pointer
        xor     %ebx, %ebx                      # bl = next byte value
        movl    imagelen, %ecx                  # ecx = bytes left
        test    %ecx, %ecx
        jz      .Lfill_done                     # empty picture: nothing to write
_loop2:
        movb    %bl, (%eax)
        inc     %bl                             # wraps from 255 back to 0
        inc     %eax
        dec     %ecx
        jnz     _loop2
.Lfill_done:
        ret
Vous n'êtes pas encore membre ?
Inscrivez-vous, c'est gratuit et ça prend moins d'une minute !
Les membres obtiennent plus de réponses que les utilisateurs anonymes.
Le fait d'être membre vous permet d'avoir un suivi détaillé de vos demandes et codes sources.
Le fait d'être membre vous permet d'avoir des options supplémentaires.