Ticket #9789: blitters.asm

File blitters.asm, 4.8 KB (added by scanty, 11 years ago)

MMX-optimised blitters

Line 
1; optimised blitters
2; initial code thanks to Caz Jones
3
; Target: 32-bit x86, NASM syntax. Every routine in this file reads its
; arguments from the stack and returns with a plain `ret`.
bits 32

; Exported entry points, listed in the order they are defined below.
global blit_2x_mmx
global blit_2x_dirty_mmx
global blit_windowed_dirty_mmx
global blit_2x_windowed_dirty_mmx
global blit_overlay
11
12
;-----------------------------------------------------------------------
; blit_2x_mmx(dest, source, size)
; Doubles a run of bytes horizontally: every source byte is stored twice,
; so each 8 source bytes produce 16 destination bytes.
; In (at entry): [esp+4]  = dest
;                [esp+8]  = source
;                [esp+12] = size; size/8 groups are processed, any
;                           remainder below 8 bytes is ignored
; Out:   no return value is set
; Clobb: ecx, mm0, mm1, flags (edi and esi are saved and restored)
;-----------------------------------------------------------------------
align 16
blit_2x_mmx:
        push    edi
        push    esi

        mov     edi, [esp+12]           ; dest (offsets include the two pushes)
        mov     esi, [esp+16]           ; source
        mov     ecx, [esp+20]           ; size
        shr     ecx, 3                  ; ecx = number of 8-byte groups
        jz      .done                   ; no full group: skip, or the counter
                                        ; would wrap and overrun both buffers

.loop:
        movq    mm0, [esi]
        movq    mm1, mm0
        punpcklbw mm0, mm0              ; low four bytes, each duplicated
        punpckhbw mm1, mm1              ; high four bytes, each duplicated
        movq    [edi], mm0
        movq    [edi+8], mm1
        add     edi, 16
        add     esi, 8
        sub     ecx, 1
        jnz     .loop

.done:
        pop     esi
        pop     edi

        emms                            ; leave the FPU usable for the caller
        ret
40
41
;-----------------------------------------------------------------------
; blit_2x_dirty_mmx(dest, source, dirty, rowbytes, size)
; Horizontally doubles source into dest, 8 source bytes at a time, and
; writes the same 16 output bytes again at dest + rowbytes (the next line).
; `dirty` holds a copy of what was last blitted: a group identical to its
; dirty copy is skipped, otherwise the dirty copy is refreshed and the
; group is written.
; In (after frame setup): [ebp+8]  = dest
;                         [ebp+12] = source
;                         [ebp+16] = dirty
;                         [ebp+20] = rowbytes
;                         [ebp+24] = size; size/8 groups are processed
; Out:   no return value is set
; Clobb: eax, ecx, mm0, mm1, flags (edi, esi, ebx, edx are restored)
;-----------------------------------------------------------------------
align 16
blit_2x_dirty_mmx:
        push    ebp
        mov     ebp, esp

        push    edi
        push    esi
        push    ebx
        push    edx

        mov     edi, [ebp+8]            ; dest
        mov     esi, [ebp+12]           ; source
        mov     eax, [ebp+16]           ; dirty
        mov     edx, [ebp+20]           ; rowbytes
        mov     ecx, [ebp+24]           ; size
        shr     ecx, 3                  ; ecx = number of 8-byte groups
        jz      .done                   ; no full group: skip, or the counter
                                        ; would wrap and overrun the buffers

.loop:
        movq    mm0, [esi]
        movq    mm1, [eax]

        pcmpeqb mm1, mm0                ; 0xFF in every byte that matches
        packsswb mm1, mm1               ; a word packs to 0xFF only if it was 0xFFFF
        movd    ebx, mm1
        cmp     ebx, 0xffffffff         ; all eight bytes unchanged?
        jz      .skip

        movq    [eax], mm0              ; remember what is now on screen
        movq    mm1, mm0
        punpcklbw mm0, mm0
        punpckhbw mm1, mm1
        movq    [edi], mm0
        movq    [edi+8], mm1

        ; omit these two for scanlines
        movq    [edi+edx], mm0          ; next line
        movq    [edi+edx+8], mm1

.skip:
        add     edi, 16
        add     esi, 8
        add     eax, 8
        sub     ecx, 1
        jnz     .loop

.done:
        pop     edx
        pop     ebx
        pop     esi
        pop     edi

        mov     esp, ebp
        pop     ebp

        emms
        ret
97
98
;-----------------------------------------------------------------------
; blit_windowed_dirty_mmx(src, dirty, dst, size, depth)
; Copies src to dst in 16-byte blocks. Each block is first compared with
; its copy in `dirty`; an identical block is skipped, a changed block is
; stored to both dirty and dst.
; In (after frame setup): [ebp+8]  = src
;                         [ebp+12] = dirty
;                         [ebp+16] = dst
;                         [ebp+20] = size; size/16 blocks are processed
;                         [ebp+24] = 2 selects the 16-bit compare,
;                                    1 the 8-bit compare, anything else
;                                    the 32-bit compare
; Out:   no return value is set
; Clobb: eax, ecx, mm0-mm3, flags (edi, esi, ebx are restored)
;-----------------------------------------------------------------------

; Fetch one 16-byte block: mm0/mm2 = source halves, mm1/mm3 = dirty halves.
%macro WDIRTY_LOAD16 0
        movq    mm0, [esi]
        movq    mm1, [eax]
        movq    mm2, [esi+8]
        movq    mm3, [eax+8]
%endmacro

; Store the source block held in mm0/mm2 to the dirty copy and to dst.
%macro WDIRTY_COMMIT16 0
        movq    [eax], mm0
        movq    [eax+8], mm2
        movq    [edi], mm0
        movq    [edi+8], mm2
%endmacro

; Step all three pointers to the next block; repeat %1 while blocks remain.
%macro WDIRTY_ADVANCE16 1
        add     esi, 16
        add     eax, 16
        add     edi, 16
        dec     ecx
        jnz     %1
%endmacro

blit_windowed_dirty_mmx:
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi
        push    ebx

        mov     esi, [ebp+8]            ; src
        mov     eax, [ebp+12]           ; dirty
        mov     edi, [ebp+16]           ; dst
        mov     ecx, [ebp+20]           ; size

        shr     ecx, 4                  ; ecx = number of 16-byte blocks
        jz      near .finish

        mov     ebx, [ebp+24]           ; depth selector (ebx is scratch here)
        cmp     ebx, 2                  ; 16 bit
        je      .loop16
        cmp     ebx, 1                  ; 8 bit
        je      near .loop8

.loop32:
        WDIRTY_LOAD16

        pcmpeqd mm1, mm0
        pcmpeqd mm3, mm2

        psrld   mm1, 31                 ; equal dword -> 1
        psrld   mm3, 31                 ; equal dword -> 1
        pslld   mm3, 1                  ;             -> 2
        por     mm1, mm3                ; 3 where both halves matched

        packsswb mm1, mm1               ; dwords 3,3 -> bytes 03 00 03 00
        movd    ebx, mm1
        cmp     ebx, 0x00030003
        jz      .skip32

        WDIRTY_COMMIT16

.skip32:
        WDIRTY_ADVANCE16 .loop32
        jmp     .finish

.loop16:
        WDIRTY_LOAD16

        pcmpeqw mm1, mm0
        pcmpeqw mm3, mm2

        psrlw   mm1, 15                 ; equal word -> 1
        psrlw   mm3, 15
        psllw   mm3, 1                  ;            -> 2
        por     mm1, mm3                ; 3 where both halves matched

        packsswb mm1, mm1               ; four words of 3 -> 03 03 03 03
        movd    ebx, mm1
        cmp     ebx, 0x03030303
        jz      .skip16

        WDIRTY_COMMIT16

.skip16:
        WDIRTY_ADVANCE16 .loop16
        jmp     .finish

.loop8:
        WDIRTY_LOAD16

        pcmpeqw mm1, mm0
        pcmpeqw mm3, mm2

        psrlw   mm1, 8                  ; equal word -> 0x00FF
        psllw   mm3, 8                  ; equal word -> 0xFF00
        por     mm1, mm3                ; 0xFFFF only where both matched

        packsswb mm1, mm1               ; only 0xFFFF packs to 0xFF
        movd    ebx, mm1
        cmp     ebx, 0xffffffff
        jz      .skip8

        WDIRTY_COMMIT16

.skip8:
        WDIRTY_ADVANCE16 .loop8

.finish:
        pop     ebx
        pop     esi
        pop     edi
        leave                           ; mov esp, ebp / pop ebp
        emms
        ret
223
224
225
;-----------------------------------------------------------------------
; blit_2x_windowed_dirty_mmx(src, dirty, dst, size, depth, bytes_per_row)
; Pixel-doubling blit with dirty tracking. Each pass reads 8 source bytes,
; compares them with the dirty copy, and if they differ refreshes the
; dirty copy and writes every pixel twice (16 bytes) to dst and again to
; dst + bytes_per_row.
; In (after frame setup): [ebp+8]  = src
;                         [ebp+12] = dirty
;                         [ebp+16] = dst
;                         [ebp+20] = size; size/16 passes are made
;                         [ebp+24] = 2 selects 16-bit pixels, 1 selects
;                                    8-bit pixels, anything else 32-bit
;                         [ebp+28] = bytes_per_row
; NOTE(review): each pass consumes 8 source bytes yet the count is
; size/16 - presumably size is measured in destination bytes; confirm
; against the caller.
; Out:   no return value is set
; Clobb: eax, ecx, mm0, mm1, flags (edi, esi, ebx, edx are restored)
;-----------------------------------------------------------------------

; One complete compare/double/store loop.
;   %1 = loop label   %2 = skip label
;   %3 = element compare   %4 / %5 = low / high self-unpack for that width
%macro BLIT2X_DIRTY_PASS 5
%1:
        movq    mm0, [esi]
        movq    mm1, [eax]

        %3      mm1, mm0                ; all-ones in every matching element
        packsswb mm1, mm1               ; packs to 0xFF bytes only if all ones
        movd    ebx, mm1
        cmp     ebx, 0xffffffff         ; whole group unchanged?
        jz      %2

        movq    [eax], mm0              ; refresh the dirty copy
        movq    mm1, mm0

        %4      mm0, mm0                ; low half, each pixel doubled
        %5      mm1, mm1                ; high half, each pixel doubled
        movq    [edi], mm0
        movq    [edi+8], mm1

        movq    [edi+edx], mm0          ; remove for scanlines
        movq    [edi+edx+8], mm1        ; remove for scanlines

%2:
        add     esi, 8
        add     eax, 8
        add     edi, 16
        dec     ecx
        jnz     %1
%endmacro

blit_2x_windowed_dirty_mmx:
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi
        push    ebx
        push    edx

        mov     esi, [ebp+8]            ; src
        mov     eax, [ebp+12]           ; dirty
        mov     edi, [ebp+16]           ; dst
        mov     ecx, [ebp+20]           ; size
        mov     edx, [ebp+28]           ; bytes_per_row

        shr     ecx, 4                  ; ecx = number of passes
        jz      near .finish

        mov     ebx, [ebp+24]           ; depth selector (ebx is scratch here)
        cmp     ebx, 2                  ; 16 bit
        je      .loop16
        cmp     ebx, 1                  ; 8 bit
        je      near .loop8

        BLIT2X_DIRTY_PASS .loop32, .skip32, pcmpeqd, punpckldq, punpckhdq
        jmp     .finish

        BLIT2X_DIRTY_PASS .loop16, .skip16, pcmpeqw, punpcklwd, punpckhwd
        jmp     .finish

        BLIT2X_DIRTY_PASS .loop8, .skip8, pcmpeqb, punpcklbw, punpckhbw

.finish:
        pop     edx
        pop     ebx
        pop     esi
        pop     edi
        leave                           ; mov esp, ebp / pop ebp
        emms
        ret
345
346
;-----------------------------------------------------------------------
; blit_overlay(dest, source, size, y, ycbcr)
; Unfinished stub: the arguments are loaded and a countdown of `size`
; iterations runs, but no pixel data is read or written.
; In (at entry): [esp+4]  = dest
;                [esp+8]  = source
;                [esp+12] = size
;                [esp+16] = y
;                [esp+20] = ycbcr
; Out:   no meaningful return value (eax is left holding y)
; Clobb: eax, ecx, edx, flags (edi and esi are saved and restored)
;-----------------------------------------------------------------------
blit_overlay:
        push    edi
        push    esi

        mov     edi, [esp+12]           ; dest (offsets include the two pushes)
        mov     esi, [esp+16]           ; source
        mov     ecx, [esp+20]           ; size
        mov     eax, [esp+24]           ; y
        mov     edx, [esp+28]           ; ycbcr

        push    ecx                     ; keep the original size on the stack

        test    ecx, ecx
        jz      .done                   ; zero size: don't let the counter wrap

.loop:
        ; TODO: overlay body not implemented yet
        sub     ecx, 1
        jnz     .loop

.done:
        pop     ecx                     ; balance the push above, otherwise
                                        ; esi/edi/return address are misread
        pop     esi
        pop     edi
        ret