Codebase list xrdp / adb43db
Re-enable SIMD code (minus the two files) on any-i386 mirabilos 7 years ago
5 changed file(s) with 1408 addition(s) and 25 deletion(s). Raw diff Collapse all Expand all
1919 libx11-dev,
2020 libxfixes-dev,
2121 libxrandr-dev,
22 # for now; see debian/rules for details
23 nasm [amd64 kfreebsd-amd64],
22 nasm [amd64 hurd-i386 i386 kfreebsd-amd64 kfreebsd-i386],
2423 openssl,
2524 pkg-config,
2625 systemd [linux-any],
00 From: Thorsten Glaser <tg@mirbsd.org>
11 Subject: Free the ebx register from use of the assembly code (WIP)
2 missing: xorgxrdp/module/x86/ i420_to_rgb32_x86_sse2.asm yv12_to_rgb32_x86_sse2.asm
23
34 --- a/librfxcodec/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
45 +++ b/librfxcodec/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
0 From: Thorsten Glaser <tg@mirbsd.org>
1 Subject: Convert i386 asm code to ELF PIC (WIP)
2
3 --- a/librfxcodec/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
4 +++ b/librfxcodec/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
5 @@ -1,5 +1,6 @@
6 ;
7 ;Copyright 2016 Jay Sorg
8 +;Copyright 2017 mirabilos
9 ;
10 ;Permission to use, copy, modify, distribute, and sell this software and its
11 ;documentation for any purpose is hereby granted without fee, provided that
12 @@ -21,6 +22,29 @@
13
14 %ifidn __OUTPUT_FORMAT__,elf
15 section .note.GNU-stack noalloc noexec nowrite progbits
16 +%ifdef PIC
17 +section .text
18 +extern _GLOBAL_OFFSET_TABLE_ ; resolved by the linker for PIC objects
19 +..@get_GOT: ; thunk: ebx = caller's return address (..@ keeps it out of local-label scoping)
20 + mov ebx,dword [esp] ; NASM syntax has no "ptr" keyword
21 + ret
22 +%define lsym(name) ebx + name wrt ..gotoff
23 +%macro get_GOT 0
24 + call ..@get_GOT
25 +%%ref: add ebx,_GLOBAL_OFFSET_TABLE_+$$-%%ref wrt ..gotpc ; ebx holds the address of %%ref here; result is the GOT base
26 +%endmacro
27 +%else
28 +%endif
29 +%else
30 +; not ELF
31 +%ifdef PIC
32 +%error Position-Independent Code is currently only supported for ELF
33 +%endif
34 +%endif
35 +%ifndef lsym
36 +%define lsym(name) name
37 +%macro get_GOT 0
38 +%endmacro
39 %endif
40
41 section .data
42 @@ -68,8 +92,8 @@ loop1a:
43 movdqa xmm2, [esi + 16]
44 movdqa xmm6, xmm1
45 movdqa xmm7, xmm2
46 - pand xmm1, [cdFFFF]
47 - pand xmm2, [cdFFFF]
48 + pand xmm1, [lsym(cdFFFF)]
49 + pand xmm2, [lsym(cdFFFF)]
50 pslld xmm1, 16
51 pslld xmm2, 16
52 psrad xmm1, 16
53 @@ -79,8 +103,8 @@ loop1a:
54 movdqa xmm3, xmm7
55 psrldq xmm2, 2
56 psrldq xmm3, 2
57 - pand xmm2, [cdFFFF]
58 - pand xmm3, [cdFFFF]
59 + pand xmm2, [lsym(cdFFFF)]
60 + pand xmm3, [lsym(cdFFFF)]
61 pslld xmm2, 16
62 pslld xmm3, 16
63 psrad xmm2, 16
64 @@ -98,8 +122,8 @@ loop1a:
65 psrldq xmm5, 12
66 pslldq xmm5, 12
67 por xmm4, xmm5
68 - pand xmm3, [cdFFFF]
69 - pand xmm4, [cdFFFF]
70 + pand xmm3, [lsym(cdFFFF)]
71 + pand xmm4, [lsym(cdFFFF)]
72 pslld xmm3, 16
73 pslld xmm4, 16
74 psrad xmm3, 16
75 @@ -260,8 +284,8 @@ loop1c:
76 movdqa xmm2, [esi + 16]
77 movdqa xmm6, xmm1
78 movdqa xmm7, xmm2
79 - pand xmm1, [cdFFFF]
80 - pand xmm2, [cdFFFF]
81 + pand xmm1, [lsym(cdFFFF)]
82 + pand xmm2, [lsym(cdFFFF)]
83 pslld xmm1, 16
84 pslld xmm2, 16
85 psrad xmm1, 16
86 @@ -271,8 +295,8 @@ loop1c:
87 movdqa xmm3, xmm7
88 psrldq xmm2, 2
89 psrldq xmm3, 2
90 - pand xmm2, [cdFFFF]
91 - pand xmm3, [cdFFFF]
92 + pand xmm2, [lsym(cdFFFF)]
93 + pand xmm3, [lsym(cdFFFF)]
94 pslld xmm2, 16
95 pslld xmm3, 16
96 psrad xmm2, 16
97 @@ -290,8 +314,8 @@ loop1c:
98 movd xmm5, eax
99 pslldq xmm5, 12
100 por xmm4, xmm5
101 - pand xmm3, [cdFFFF]
102 - pand xmm4, [cdFFFF]
103 + pand xmm3, [lsym(cdFFFF)]
104 + pand xmm4, [lsym(cdFFFF)]
105 pslld xmm3, 16
106 pslld xmm4, 16
107 psrad xmm3, 16
108 @@ -341,8 +365,8 @@ loop1c:
109 movdqa xmm2, [esi + 16]
110 movdqa xmm6, xmm1
111 movdqa xmm7, xmm2
112 - pand xmm1, [cdFFFF]
113 - pand xmm2, [cdFFFF]
114 + pand xmm1, [lsym(cdFFFF)]
115 + pand xmm2, [lsym(cdFFFF)]
116 pslld xmm1, 16
117 pslld xmm2, 16
118 psrad xmm1, 16
119 @@ -352,8 +376,8 @@ loop1c:
120 movdqa xmm3, xmm7
121 psrldq xmm2, 2
122 psrldq xmm3, 2
123 - pand xmm2, [cdFFFF]
124 - pand xmm3, [cdFFFF]
125 + pand xmm2, [lsym(cdFFFF)]
126 + pand xmm3, [lsym(cdFFFF)]
127 pslld xmm2, 16
128 pslld xmm3, 16
129 psrad xmm2, 16
130 @@ -371,8 +395,8 @@ loop1c:
131 psrldq xmm5, 12
132 pslldq xmm5, 12
133 por xmm4, xmm5
134 - pand xmm3, [cdFFFF]
135 - pand xmm4, [cdFFFF]
136 + pand xmm3, [lsym(cdFFFF)]
137 + pand xmm4, [lsym(cdFFFF)]
138 pslld xmm3, 16
139 pslld xmm4, 16
140 psrad xmm3, 16
141 @@ -436,8 +460,8 @@ loop1c1:
142 movdqa xmm2, [esi + 16]
143 movdqa xmm6, xmm1
144 movdqa xmm7, xmm2
145 - pand xmm1, [cdFFFF]
146 - pand xmm2, [cdFFFF]
147 + pand xmm1, [lsym(cdFFFF)]
148 + pand xmm2, [lsym(cdFFFF)]
149 pslld xmm1, 16
150 pslld xmm2, 16
151 psrad xmm1, 16
152 @@ -447,8 +471,8 @@ loop1c1:
153 movdqa xmm3, xmm7
154 psrldq xmm2, 2
155 psrldq xmm3, 2
156 - pand xmm2, [cdFFFF]
157 - pand xmm3, [cdFFFF]
158 + pand xmm2, [lsym(cdFFFF)]
159 + pand xmm3, [lsym(cdFFFF)]
160 pslld xmm2, 16
161 pslld xmm3, 16
162 psrad xmm2, 16
163 @@ -466,8 +490,8 @@ loop1c1:
164 movd xmm5, eax
165 pslldq xmm5, 12
166 por xmm4, xmm5
167 - pand xmm3, [cdFFFF]
168 - pand xmm4, [cdFFFF]
169 + pand xmm3, [lsym(cdFFFF)]
170 + pand xmm4, [lsym(cdFFFF)]
171 pslld xmm3, 16
172 pslld xmm4, 16
173 psrad xmm3, 16
174 @@ -514,8 +538,8 @@ loop1c1:
175 movdqa xmm2, [esi + 16]
176 movdqa xmm6, xmm1
177 movdqa xmm7, xmm2
178 - pand xmm1, [cdFFFF]
179 - pand xmm2, [cdFFFF]
180 + pand xmm1, [lsym(cdFFFF)]
181 + pand xmm2, [lsym(cdFFFF)]
182 pslld xmm1, 16
183 pslld xmm2, 16
184 psrad xmm1, 16
185 @@ -525,8 +549,8 @@ loop1c1:
186 movdqa xmm3, xmm7
187 psrldq xmm2, 2
188 psrldq xmm3, 2
189 - pand xmm2, [cdFFFF]
190 - pand xmm3, [cdFFFF]
191 + pand xmm2, [lsym(cdFFFF)]
192 + pand xmm3, [lsym(cdFFFF)]
193 pslld xmm2, 16
194 pslld xmm3, 16
195 psrad xmm2, 16
196 @@ -544,8 +568,8 @@ loop1c1:
197 psrldq xmm5, 12
198 pslldq xmm5, 12
199 por xmm4, xmm5
200 - pand xmm3, [cdFFFF]
201 - pand xmm4, [cdFFFF]
202 + pand xmm3, [lsym(cdFFFF)]
203 + pand xmm4, [lsym(cdFFFF)]
204 pslld xmm3, 16
205 pslld xmm4, 16
206 psrad xmm3, 16
207 @@ -703,8 +727,8 @@ loop1e:
208 movdqa xmm2, [esi + 16]
209 movdqa xmm6, xmm1
210 movdqa xmm7, xmm2
211 - pand xmm1, [cdFFFF]
212 - pand xmm2, [cdFFFF]
213 + pand xmm1, [lsym(cdFFFF)]
214 + pand xmm2, [lsym(cdFFFF)]
215 pslld xmm1, 16
216 pslld xmm2, 16
217 psrad xmm1, 16
218 @@ -714,8 +738,8 @@ loop1e:
219 movdqa xmm3, xmm7
220 psrldq xmm2, 2
221 psrldq xmm3, 2
222 - pand xmm2, [cdFFFF]
223 - pand xmm3, [cdFFFF]
224 + pand xmm2, [lsym(cdFFFF)]
225 + pand xmm3, [lsym(cdFFFF)]
226 pslld xmm2, 16
227 pslld xmm3, 16
228 psrad xmm2, 16
229 @@ -733,8 +757,8 @@ loop1e:
230 movd xmm5, eax
231 pslldq xmm5, 12
232 por xmm4, xmm5
233 - pand xmm3, [cdFFFF]
234 - pand xmm4, [cdFFFF]
235 + pand xmm3, [lsym(cdFFFF)]
236 + pand xmm4, [lsym(cdFFFF)]
237 pslld xmm3, 16
238 pslld xmm4, 16
239 psrad xmm3, 16
240 @@ -787,8 +811,8 @@ loop2e:
241 movdqa xmm2, [esi + 16]
242 movdqa xmm6, xmm1
243 movdqa xmm7, xmm2
244 - pand xmm1, [cdFFFF]
245 - pand xmm2, [cdFFFF]
246 + pand xmm1, [lsym(cdFFFF)]
247 + pand xmm2, [lsym(cdFFFF)]
248 pslld xmm1, 16
249 pslld xmm2, 16
250 psrad xmm1, 16
251 @@ -798,8 +822,8 @@ loop2e:
252 movdqa xmm3, xmm7
253 psrldq xmm2, 2
254 psrldq xmm3, 2
255 - pand xmm2, [cdFFFF]
256 - pand xmm3, [cdFFFF]
257 + pand xmm2, [lsym(cdFFFF)]
258 + pand xmm3, [lsym(cdFFFF)]
259 pslld xmm2, 16
260 pslld xmm3, 16
261 psrad xmm2, 16
262 @@ -817,8 +841,8 @@ loop2e:
263 movd xmm5, eax
264 pslldq xmm5, 12
265 por xmm4, xmm5
266 - pand xmm3, [cdFFFF]
267 - pand xmm4, [cdFFFF]
268 + pand xmm3, [lsym(cdFFFF)]
269 + pand xmm4, [lsym(cdFFFF)]
270 pslld xmm3, 16
271 pslld xmm4, 16
272 psrad xmm3, 16
273 @@ -870,8 +894,8 @@ loop2e:
274 movdqa xmm2, [esi + 16]
275 movdqa xmm6, xmm1
276 movdqa xmm7, xmm2
277 - pand xmm1, [cdFFFF]
278 - pand xmm2, [cdFFFF]
279 + pand xmm1, [lsym(cdFFFF)]
280 + pand xmm2, [lsym(cdFFFF)]
281 pslld xmm1, 16
282 pslld xmm2, 16
283 psrad xmm1, 16
284 @@ -881,8 +905,8 @@ loop2e:
285 movdqa xmm3, xmm7
286 psrldq xmm2, 2
287 psrldq xmm3, 2
288 - pand xmm2, [cdFFFF]
289 - pand xmm3, [cdFFFF]
290 + pand xmm2, [lsym(cdFFFF)]
291 + pand xmm3, [lsym(cdFFFF)]
292 pslld xmm2, 16
293 pslld xmm3, 16
294 psrad xmm2, 16
295 @@ -900,8 +924,8 @@ loop2e:
296 psrldq xmm5, 12
297 pslldq xmm5, 12
298 por xmm4, xmm5
299 - pand xmm3, [cdFFFF]
300 - pand xmm4, [cdFFFF]
301 + pand xmm3, [lsym(cdFFFF)]
302 + pand xmm4, [lsym(cdFFFF)]
303 pslld xmm3, 16
304 pslld xmm4, 16
305 psrad xmm3, 16
306 @@ -965,8 +989,8 @@ loop1e1:
307 movdqa xmm2, [esi + 16]
308 movdqa xmm6, xmm1
309 movdqa xmm7, xmm2
310 - pand xmm1, [cdFFFF]
311 - pand xmm2, [cdFFFF]
312 + pand xmm1, [lsym(cdFFFF)]
313 + pand xmm2, [lsym(cdFFFF)]
314 pslld xmm1, 16
315 pslld xmm2, 16
316 psrad xmm1, 16
317 @@ -976,8 +1000,8 @@ loop1e1:
318 movdqa xmm3, xmm7
319 psrldq xmm2, 2
320 psrldq xmm3, 2
321 - pand xmm2, [cdFFFF]
322 - pand xmm3, [cdFFFF]
323 + pand xmm2, [lsym(cdFFFF)]
324 + pand xmm3, [lsym(cdFFFF)]
325 pslld xmm2, 16
326 pslld xmm3, 16
327 psrad xmm2, 16
328 @@ -995,8 +1019,8 @@ loop1e1:
329 movd xmm5, eax
330 pslldq xmm5, 12
331 por xmm4, xmm5
332 - pand xmm3, [cdFFFF]
333 - pand xmm4, [cdFFFF]
334 + pand xmm3, [lsym(cdFFFF)]
335 + pand xmm4, [lsym(cdFFFF)]
336 pslld xmm3, 16
337 pslld xmm4, 16
338 psrad xmm3, 16
339 @@ -1046,8 +1070,8 @@ loop2e1:
340 movdqa xmm2, [esi + 16]
341 movdqa xmm6, xmm1
342 movdqa xmm7, xmm2
343 - pand xmm1, [cdFFFF]
344 - pand xmm2, [cdFFFF]
345 + pand xmm1, [lsym(cdFFFF)]
346 + pand xmm2, [lsym(cdFFFF)]
347 pslld xmm1, 16
348 pslld xmm2, 16
349 psrad xmm1, 16
350 @@ -1057,8 +1081,8 @@ loop2e1:
351 movdqa xmm3, xmm7
352 psrldq xmm2, 2
353 psrldq xmm3, 2
354 - pand xmm2, [cdFFFF]
355 - pand xmm3, [cdFFFF]
356 + pand xmm2, [lsym(cdFFFF)]
357 + pand xmm3, [lsym(cdFFFF)]
358 pslld xmm2, 16
359 pslld xmm3, 16
360 psrad xmm2, 16
361 @@ -1076,8 +1100,8 @@ loop2e1:
362 movd xmm5, eax
363 pslldq xmm5, 12
364 por xmm4, xmm5
365 - pand xmm3, [cdFFFF]
366 - pand xmm4, [cdFFFF]
367 + pand xmm3, [lsym(cdFFFF)]
368 + pand xmm4, [lsym(cdFFFF)]
369 pslld xmm3, 16
370 pslld xmm4, 16
371 psrad xmm3, 16
372 @@ -1126,8 +1150,8 @@ loop2e1:
373 movdqa xmm2, [esi + 16]
374 movdqa xmm6, xmm1
375 movdqa xmm7, xmm2
376 - pand xmm1, [cdFFFF]
377 - pand xmm2, [cdFFFF]
378 + pand xmm1, [lsym(cdFFFF)]
379 + pand xmm2, [lsym(cdFFFF)]
380 pslld xmm1, 16
381 pslld xmm2, 16
382 psrad xmm1, 16
383 @@ -1137,8 +1161,8 @@ loop2e1:
384 movdqa xmm3, xmm7
385 psrldq xmm2, 2
386 psrldq xmm3, 2
387 - pand xmm2, [cdFFFF]
388 - pand xmm3, [cdFFFF]
389 + pand xmm2, [lsym(cdFFFF)]
390 + pand xmm3, [lsym(cdFFFF)]
391 pslld xmm2, 16
392 pslld xmm3, 16
393 psrad xmm2, 16
394 @@ -1156,8 +1180,8 @@ loop2e1:
395 psrldq xmm5, 12
396 pslldq xmm5, 12
397 por xmm4, xmm5
398 - pand xmm3, [cdFFFF]
399 - pand xmm4, [cdFFFF]
400 + pand xmm3, [lsym(cdFFFF)]
401 + pand xmm4, [lsym(cdFFFF)]
402 pslld xmm3, 16
403 pslld xmm4, 16
404 psrad xmm3, 16
405 @@ -1220,9 +1244,9 @@ loop1f:
406 punpcklbw xmm1, xmm0
407 punpcklbw xmm2, xmm0
408 punpcklbw xmm3, xmm0
409 - psubw xmm1, [cw128]
410 - psubw xmm2, [cw128]
411 - psubw xmm3, [cw128]
412 + psubw xmm1, [lsym(cw128)]
413 + psubw xmm2, [lsym(cw128)]
414 + psubw xmm3, [lsym(cw128)]
415 psllw xmm1, 5
416 psllw xmm2, 5
417 psllw xmm3, 5
418 @@ -1254,8 +1278,8 @@ loop2f:
419 movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2]
420 punpcklbw xmm2, xmm0
421 punpcklbw xmm3, xmm0
422 - psubw xmm2, [cw128]
423 - psubw xmm3, [cw128]
424 + psubw xmm2, [lsym(cw128)]
425 + psubw xmm3, [lsym(cw128)]
426 psllw xmm2, 5
427 psllw xmm3, 5
428 movdqa xmm4, xmm1
429 @@ -1287,7 +1311,7 @@ loop2f:
430 movdqa xmm1, xmm3 ; src[2n]
431 movq xmm2, [esi + 64 * 1] ; src[2n + 1]
432 punpcklbw xmm2, xmm0
433 - psubw xmm2, [cw128]
434 + psubw xmm2, [lsym(cw128)]
435 psllw xmm2, 5
436 movdqa xmm4, xmm1
437 movdqa xmm5, xmm2
438 @@ -1328,7 +1352,7 @@ set_quants_hi:
439 movd xmm1, eax
440 movdqa LHI_SFT, xmm1
441 imul eax, 16
442 - lea edx, [cwa0]
443 + lea edx, [lsym(cwa0)]
444 add edx, eax
445 movdqa xmm1, [edx]
446 movdqa LHI_ADD, xmm1
447 @@ -1339,7 +1363,7 @@ set_quants_lo:
448 movd xmm1, eax
449 movdqa LLO_SFT, xmm1
450 imul eax, 16
451 - lea edx, [cwa0]
452 + lea edx, [lsym(cwa0)]
453 add edx, eax
454 movdqa xmm1, [edx]
455 movdqa LLO_ADD, xmm1
456 @@ -1375,6 +1399,7 @@ PROC _rfxcodec_encode_dwt_shift_x86_sse2
457 movdqu [esp], xmm0
458 ; save registers
459 push ebx
460 + get_GOT
461 push esi
462 push edi
463 push ebp
464 --- a/librfxcodec/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
465 +++ b/librfxcodec/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
466 @@ -1,5 +1,6 @@
467 ;
468 ;Copyright 2016 Jay Sorg
469 +;Copyright 2017 mirabilos
470 ;
471 ;Permission to use, copy, modify, distribute, and sell this software and its
472 ;documentation for any purpose is hereby granted without fee, provided that
473 @@ -21,6 +22,29 @@
474
475 %ifidn __OUTPUT_FORMAT__,elf
476 section .note.GNU-stack noalloc noexec nowrite progbits
477 +%ifdef PIC
478 +section .text
479 +extern _GLOBAL_OFFSET_TABLE_ ; resolved by the linker for PIC objects
480 +..@get_GOT: ; thunk: ebx = caller's return address (..@ keeps it out of local-label scoping)
481 + mov ebx,dword [esp] ; NASM syntax has no "ptr" keyword
482 + ret
483 +%define lsym(name) ebx + name wrt ..gotoff
484 +%macro get_GOT 0
485 + call ..@get_GOT
486 +%%ref: add ebx,_GLOBAL_OFFSET_TABLE_+$$-%%ref wrt ..gotpc ; ebx holds the address of %%ref here; result is the GOT base
487 +%endmacro
488 +%else
489 +%endif
490 +%else
491 +; not ELF
492 +%ifdef PIC
493 +%error Position-Independent Code is currently only supported for ELF
494 +%endif
495 +%endif
496 +%ifndef lsym
497 +%define lsym(name) name
498 +%macro get_GOT 0
499 +%endmacro
500 %endif
501
502 section .data
503 @@ -68,15 +92,15 @@ loop1a:
504 movdqa xmm2, [esi + 16]
505 movdqa xmm6, xmm1
506 movdqa xmm7, xmm2
507 - pand xmm1, [cdFFFF]
508 - pand xmm2, [cdFFFF]
509 + pand xmm1, [lsym(cdFFFF)]
510 + pand xmm2, [lsym(cdFFFF)]
511 packusdw xmm1, xmm2
512 movdqa xmm2, xmm6 ; src[2n + 1]
513 movdqa xmm3, xmm7
514 psrldq xmm2, 2
515 psrldq xmm3, 2
516 - pand xmm2, [cdFFFF]
517 - pand xmm3, [cdFFFF]
518 + pand xmm2, [lsym(cdFFFF)]
519 + pand xmm3, [lsym(cdFFFF)]
520 packusdw xmm2, xmm3
521 movdqa xmm3, xmm6 ; src[2n + 2]
522 movdqa xmm4, xmm7
523 @@ -90,8 +114,8 @@ loop1a:
524 psrldq xmm5, 12
525 pslldq xmm5, 12
526 por xmm4, xmm5
527 - pand xmm3, [cdFFFF]
528 - pand xmm4, [cdFFFF]
529 + pand xmm3, [lsym(cdFFFF)]
530 + pand xmm4, [lsym(cdFFFF)]
531 packusdw xmm3, xmm4
532 movdqa xmm4, xmm1
533 movdqa xmm5, xmm2
534 @@ -248,15 +272,15 @@ loop1c:
535 movdqa xmm2, [esi + 16]
536 movdqa xmm6, xmm1
537 movdqa xmm7, xmm2
538 - pand xmm1, [cdFFFF]
539 - pand xmm2, [cdFFFF]
540 + pand xmm1, [lsym(cdFFFF)]
541 + pand xmm2, [lsym(cdFFFF)]
542 packusdw xmm1, xmm2
543 movdqa xmm2, xmm6 ; src[2n + 1]
544 movdqa xmm3, xmm7
545 psrldq xmm2, 2
546 psrldq xmm3, 2
547 - pand xmm2, [cdFFFF]
548 - pand xmm3, [cdFFFF]
549 + pand xmm2, [lsym(cdFFFF)]
550 + pand xmm3, [lsym(cdFFFF)]
551 packusdw xmm2, xmm3
552 movdqa xmm3, xmm6 ; src[2n + 2]
553 movdqa xmm4, xmm7
554 @@ -270,8 +294,8 @@ loop1c:
555 movd xmm5, eax
556 pslldq xmm5, 12
557 por xmm4, xmm5
558 - pand xmm3, [cdFFFF]
559 - pand xmm4, [cdFFFF]
560 + pand xmm3, [lsym(cdFFFF)]
561 + pand xmm4, [lsym(cdFFFF)]
562 packusdw xmm3, xmm4
563 movdqa xmm4, xmm1
564 movdqa xmm5, xmm2
565 @@ -317,15 +341,15 @@ loop1c:
566 movdqa xmm2, [esi + 16]
567 movdqa xmm6, xmm1
568 movdqa xmm7, xmm2
569 - pand xmm1, [cdFFFF]
570 - pand xmm2, [cdFFFF]
571 + pand xmm1, [lsym(cdFFFF)]
572 + pand xmm2, [lsym(cdFFFF)]
573 packusdw xmm1, xmm2
574 movdqa xmm2, xmm6 ; src[2n + 1]
575 movdqa xmm3, xmm7
576 psrldq xmm2, 2
577 psrldq xmm3, 2
578 - pand xmm2, [cdFFFF]
579 - pand xmm3, [cdFFFF]
580 + pand xmm2, [lsym(cdFFFF)]
581 + pand xmm3, [lsym(cdFFFF)]
582 packusdw xmm2, xmm3
583 movdqa xmm3, xmm6 ; src[2n + 2]
584 movdqa xmm4, xmm7
585 @@ -339,8 +363,8 @@ loop1c:
586 psrldq xmm5, 12
587 pslldq xmm5, 12
588 por xmm4, xmm5
589 - pand xmm3, [cdFFFF]
590 - pand xmm4, [cdFFFF]
591 + pand xmm3, [lsym(cdFFFF)]
592 + pand xmm4, [lsym(cdFFFF)]
593 packusdw xmm3, xmm4
594 movdqa xmm4, xmm1
595 movdqa xmm5, xmm2
596 @@ -400,15 +424,15 @@ loop1c1:
597 movdqa xmm2, [esi + 16]
598 movdqa xmm6, xmm1
599 movdqa xmm7, xmm2
600 - pand xmm1, [cdFFFF]
601 - pand xmm2, [cdFFFF]
602 + pand xmm1, [lsym(cdFFFF)]
603 + pand xmm2, [lsym(cdFFFF)]
604 packusdw xmm1, xmm2
605 movdqa xmm2, xmm6 ; src[2n + 1]
606 movdqa xmm3, xmm7
607 psrldq xmm2, 2
608 psrldq xmm3, 2
609 - pand xmm2, [cdFFFF]
610 - pand xmm3, [cdFFFF]
611 + pand xmm2, [lsym(cdFFFF)]
612 + pand xmm3, [lsym(cdFFFF)]
613 packusdw xmm2, xmm3
614 movdqa xmm3, xmm6 ; src[2n + 2]
615 movdqa xmm4, xmm7
616 @@ -422,8 +446,8 @@ loop1c1:
617 movd xmm5, eax
618 pslldq xmm5, 12
619 por xmm4, xmm5
620 - pand xmm3, [cdFFFF]
621 - pand xmm4, [cdFFFF]
622 + pand xmm3, [lsym(cdFFFF)]
623 + pand xmm4, [lsym(cdFFFF)]
624 packusdw xmm3, xmm4
625 movdqa xmm4, xmm1
626 movdqa xmm5, xmm2
627 @@ -466,15 +490,15 @@ loop1c1:
628 movdqa xmm2, [esi + 16]
629 movdqa xmm6, xmm1
630 movdqa xmm7, xmm2
631 - pand xmm1, [cdFFFF]
632 - pand xmm2, [cdFFFF]
633 + pand xmm1, [lsym(cdFFFF)]
634 + pand xmm2, [lsym(cdFFFF)]
635 packusdw xmm1, xmm2
636 movdqa xmm2, xmm6 ; src[2n + 1]
637 movdqa xmm3, xmm7
638 psrldq xmm2, 2
639 psrldq xmm3, 2
640 - pand xmm2, [cdFFFF]
641 - pand xmm3, [cdFFFF]
642 + pand xmm2, [lsym(cdFFFF)]
643 + pand xmm3, [lsym(cdFFFF)]
644 packusdw xmm2, xmm3
645 movdqa xmm3, xmm6 ; src[2n + 2]
646 movdqa xmm4, xmm7
647 @@ -488,8 +512,8 @@ loop1c1:
648 psrldq xmm5, 12
649 pslldq xmm5, 12
650 por xmm4, xmm5
651 - pand xmm3, [cdFFFF]
652 - pand xmm4, [cdFFFF]
653 + pand xmm3, [lsym(cdFFFF)]
654 + pand xmm4, [lsym(cdFFFF)]
655 packusdw xmm3, xmm4
656 movdqa xmm4, xmm1
657 movdqa xmm5, xmm2
658 @@ -643,15 +667,15 @@ loop1e:
659 movdqa xmm2, [esi + 16]
660 movdqa xmm6, xmm1
661 movdqa xmm7, xmm2
662 - pand xmm1, [cdFFFF]
663 - pand xmm2, [cdFFFF]
664 + pand xmm1, [lsym(cdFFFF)]
665 + pand xmm2, [lsym(cdFFFF)]
666 packusdw xmm1, xmm2
667 movdqa xmm2, xmm6 ; src[2n + 1]
668 movdqa xmm3, xmm7
669 psrldq xmm2, 2
670 psrldq xmm3, 2
671 - pand xmm2, [cdFFFF]
672 - pand xmm3, [cdFFFF]
673 + pand xmm2, [lsym(cdFFFF)]
674 + pand xmm3, [lsym(cdFFFF)]
675 packusdw xmm2, xmm3
676 movdqa xmm3, xmm6 ; src[2n + 2]
677 movdqa xmm4, xmm7
678 @@ -665,8 +689,8 @@ loop1e:
679 movd xmm5, eax
680 pslldq xmm5, 12
681 por xmm4, xmm5
682 - pand xmm3, [cdFFFF]
683 - pand xmm4, [cdFFFF]
684 + pand xmm3, [lsym(cdFFFF)]
685 + pand xmm4, [lsym(cdFFFF)]
686 packusdw xmm3, xmm4
687 movdqa xmm4, xmm1
688 movdqa xmm5, xmm2
689 @@ -715,15 +739,15 @@ loop2e:
690 movdqa xmm2, [esi + 16]
691 movdqa xmm6, xmm1
692 movdqa xmm7, xmm2
693 - pand xmm1, [cdFFFF]
694 - pand xmm2, [cdFFFF]
695 + pand xmm1, [lsym(cdFFFF)]
696 + pand xmm2, [lsym(cdFFFF)]
697 packusdw xmm1, xmm2
698 movdqa xmm2, xmm6 ; src[2n + 1]
699 movdqa xmm3, xmm7
700 psrldq xmm2, 2
701 psrldq xmm3, 2
702 - pand xmm2, [cdFFFF]
703 - pand xmm3, [cdFFFF]
704 + pand xmm2, [lsym(cdFFFF)]
705 + pand xmm3, [lsym(cdFFFF)]
706 packusdw xmm2, xmm3
707 movdqa xmm3, xmm6 ; src[2n + 2]
708 movdqa xmm4, xmm7
709 @@ -737,8 +761,8 @@ loop2e:
710 movd xmm5, eax
711 pslldq xmm5, 12
712 por xmm4, xmm5
713 - pand xmm3, [cdFFFF]
714 - pand xmm4, [cdFFFF]
715 + pand xmm3, [lsym(cdFFFF)]
716 + pand xmm4, [lsym(cdFFFF)]
717 packusdw xmm3, xmm4
718 movdqa xmm4, xmm1
719 movdqa xmm5, xmm2
720 @@ -786,15 +810,15 @@ loop2e:
721 movdqa xmm2, [esi + 16]
722 movdqa xmm6, xmm1
723 movdqa xmm7, xmm2
724 - pand xmm1, [cdFFFF]
725 - pand xmm2, [cdFFFF]
726 + pand xmm1, [lsym(cdFFFF)]
727 + pand xmm2, [lsym(cdFFFF)]
728 packusdw xmm1, xmm2
729 movdqa xmm2, xmm6 ; src[2n + 1]
730 movdqa xmm3, xmm7
731 psrldq xmm2, 2
732 psrldq xmm3, 2
733 - pand xmm2, [cdFFFF]
734 - pand xmm3, [cdFFFF]
735 + pand xmm2, [lsym(cdFFFF)]
736 + pand xmm3, [lsym(cdFFFF)]
737 packusdw xmm2, xmm3
738 movdqa xmm3, xmm6 ; src[2n + 2]
739 movdqa xmm4, xmm7
740 @@ -808,8 +832,8 @@ loop2e:
741 psrldq xmm5, 12
742 pslldq xmm5, 12
743 por xmm4, xmm5
744 - pand xmm3, [cdFFFF]
745 - pand xmm4, [cdFFFF]
746 + pand xmm3, [lsym(cdFFFF)]
747 + pand xmm4, [lsym(cdFFFF)]
748 packusdw xmm3, xmm4
749 movdqa xmm4, xmm1
750 movdqa xmm5, xmm2
751 @@ -869,15 +893,15 @@ loop1e1:
752 movdqa xmm2, [esi + 16]
753 movdqa xmm6, xmm1
754 movdqa xmm7, xmm2
755 - pand xmm1, [cdFFFF]
756 - pand xmm2, [cdFFFF]
757 + pand xmm1, [lsym(cdFFFF)]
758 + pand xmm2, [lsym(cdFFFF)]
759 packusdw xmm1, xmm2
760 movdqa xmm2, xmm6 ; src[2n + 1]
761 movdqa xmm3, xmm7
762 psrldq xmm2, 2
763 psrldq xmm3, 2
764 - pand xmm2, [cdFFFF]
765 - pand xmm3, [cdFFFF]
766 + pand xmm2, [lsym(cdFFFF)]
767 + pand xmm3, [lsym(cdFFFF)]
768 packusdw xmm2, xmm3
769 movdqa xmm3, xmm6 ; src[2n + 2]
770 movdqa xmm4, xmm7
771 @@ -891,8 +915,8 @@ loop1e1:
772 movd xmm5, eax
773 pslldq xmm5, 12
774 por xmm4, xmm5
775 - pand xmm3, [cdFFFF]
776 - pand xmm4, [cdFFFF]
777 + pand xmm3, [lsym(cdFFFF)]
778 + pand xmm4, [lsym(cdFFFF)]
779 packusdw xmm3, xmm4
780 movdqa xmm4, xmm1
781 movdqa xmm5, xmm2
782 @@ -938,15 +962,15 @@ loop2e1:
783 movdqa xmm2, [esi + 16]
784 movdqa xmm6, xmm1
785 movdqa xmm7, xmm2
786 - pand xmm1, [cdFFFF]
787 - pand xmm2, [cdFFFF]
788 + pand xmm1, [lsym(cdFFFF)]
789 + pand xmm2, [lsym(cdFFFF)]
790 packusdw xmm1, xmm2
791 movdqa xmm2, xmm6 ; src[2n + 1]
792 movdqa xmm3, xmm7
793 psrldq xmm2, 2
794 psrldq xmm3, 2
795 - pand xmm2, [cdFFFF]
796 - pand xmm3, [cdFFFF]
797 + pand xmm2, [lsym(cdFFFF)]
798 + pand xmm3, [lsym(cdFFFF)]
799 packusdw xmm2, xmm3
800 movdqa xmm3, xmm6 ; src[2n + 2]
801 movdqa xmm4, xmm7
802 @@ -960,8 +984,8 @@ loop2e1:
803 movd xmm5, eax
804 pslldq xmm5, 12
805 por xmm4, xmm5
806 - pand xmm3, [cdFFFF]
807 - pand xmm4, [cdFFFF]
808 + pand xmm3, [lsym(cdFFFF)]
809 + pand xmm4, [lsym(cdFFFF)]
810 packusdw xmm3, xmm4
811 movdqa xmm4, xmm1
812 movdqa xmm5, xmm2
813 @@ -1006,15 +1030,15 @@ loop2e1:
814 movdqa xmm2, [esi + 16]
815 movdqa xmm6, xmm1
816 movdqa xmm7, xmm2
817 - pand xmm1, [cdFFFF]
818 - pand xmm2, [cdFFFF]
819 + pand xmm1, [lsym(cdFFFF)]
820 + pand xmm2, [lsym(cdFFFF)]
821 packusdw xmm1, xmm2
822 movdqa xmm2, xmm6 ; src[2n + 1]
823 movdqa xmm3, xmm7
824 psrldq xmm2, 2
825 psrldq xmm3, 2
826 - pand xmm2, [cdFFFF]
827 - pand xmm3, [cdFFFF]
828 + pand xmm2, [lsym(cdFFFF)]
829 + pand xmm3, [lsym(cdFFFF)]
830 packusdw xmm2, xmm3
831 movdqa xmm3, xmm6 ; src[2n + 2]
832 movdqa xmm4, xmm7
833 @@ -1028,8 +1052,8 @@ loop2e1:
834 psrldq xmm5, 12
835 pslldq xmm5, 12
836 por xmm4, xmm5
837 - pand xmm3, [cdFFFF]
838 - pand xmm4, [cdFFFF]
839 + pand xmm3, [lsym(cdFFFF)]
840 + pand xmm4, [lsym(cdFFFF)]
841 packusdw xmm3, xmm4
842 movdqa xmm4, xmm1
843 movdqa xmm5, xmm2
844 @@ -1088,9 +1112,9 @@ loop1f:
845 punpcklbw xmm1, xmm0
846 punpcklbw xmm2, xmm0
847 punpcklbw xmm3, xmm0
848 - psubw xmm1, [cw128]
849 - psubw xmm2, [cw128]
850 - psubw xmm3, [cw128]
851 + psubw xmm1, [lsym(cw128)]
852 + psubw xmm2, [lsym(cw128)]
853 + psubw xmm3, [lsym(cw128)]
854 psllw xmm1, 5
855 psllw xmm2, 5
856 psllw xmm3, 5
857 @@ -1122,8 +1146,8 @@ loop2f:
858 movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2]
859 punpcklbw xmm2, xmm0
860 punpcklbw xmm3, xmm0
861 - psubw xmm2, [cw128]
862 - psubw xmm3, [cw128]
863 + psubw xmm2, [lsym(cw128)]
864 + psubw xmm3, [lsym(cw128)]
865 psllw xmm2, 5
866 psllw xmm3, 5
867 movdqa xmm4, xmm1
868 @@ -1155,7 +1179,7 @@ loop2f:
869 movdqa xmm1, xmm3 ; src[2n]
870 movq xmm2, [esi + 64 * 1] ; src[2n + 1]
871 punpcklbw xmm2, xmm0
872 - psubw xmm2, [cw128]
873 + psubw xmm2, [lsym(cw128)]
874 psllw xmm2, 5
875 movdqa xmm4, xmm1
876 movdqa xmm5, xmm2
877 @@ -1196,7 +1220,7 @@ set_quants_hi:
878 movd xmm1, eax
879 movdqa LHI_SFT, xmm1
880 imul eax, 16
881 - lea edx, [cwa0]
882 + lea edx, [lsym(cwa0)]
883 add edx, eax
884 movdqa xmm1, [edx]
885 movdqa LHI_ADD, xmm1
886 @@ -1207,7 +1231,7 @@ set_quants_lo:
887 movd xmm1, eax
888 movdqa LLO_SFT, xmm1
889 imul eax, 16
890 - lea edx, [cwa0]
891 + lea edx, [lsym(cwa0)]
892 add edx, eax
893 movdqa xmm1, [edx]
894 movdqa LLO_ADD, xmm1
895 @@ -1243,6 +1267,7 @@ PROC _rfxcodec_encode_dwt_shift_x86_sse4
896 movdqu [esp], xmm0
897 ; save registers
898 push ebx
899 + get_GOT
900 push esi
901 push edi
902 push ebp
903 --- a/xorgxrdp/module/x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm
904 +++ b/xorgxrdp/module/x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm
905 @@ -1,5 +1,6 @@
906 ;
907 ;Copyright 2014 Jay Sorg
908 +;Copyright 2017 mirabilos
909 ;
910 ;Permission to use, copy, modify, distribute, and sell this software and its
911 ;documentation for any purpose is hereby granted without fee, provided that
912 @@ -22,7 +23,30 @@
913 ;
914
915 %ifidn __OUTPUT_FORMAT__,elf
916 -SECTION .note.GNU-stack noalloc noexec nowrite progbits
917 +section .note.GNU-stack noalloc noexec nowrite progbits
918 +%ifdef PIC
919 +section .text
920 +extern _GLOBAL_OFFSET_TABLE_ ; resolved by the linker for PIC objects
921 +..@get_GOT: ; thunk: ebx = caller's return address (..@ keeps it out of local-label scoping)
922 + mov ebx,dword [esp] ; NASM syntax has no "ptr" keyword
923 + ret
924 +%define lsym(name) ebx + name wrt ..gotoff
925 +%macro get_GOT 0
926 + call ..@get_GOT
927 +%%ref: add ebx,_GLOBAL_OFFSET_TABLE_+$$-%%ref wrt ..gotpc ; ebx holds the address of %%ref here; result is the GOT base
928 +%endmacro
929 +%else
930 +%endif
931 +%else
932 +; not ELF
933 +%ifdef PIC
934 +%error Position-Independent Code is currently only supported for ELF
935 +%endif
936 +%endif
937 +%ifndef lsym
938 +%define lsym(name) name
939 +%macro get_GOT 0
940 +%endmacro
941 %endif
942
943 SECTION .data
944 @@ -49,13 +73,14 @@ PROC a8r8g8b8_to_a8b8g8r8_box_x86_sse2
945 PROC _a8r8g8b8_to_a8b8g8r8_box_x86_sse2
946 %endif
947 push ebx
948 + get_GOT
949 push esi
950 push edi
951 push ebp
952
953 - movdqa xmm4, [c1]
954 - movdqa xmm5, [c2]
955 - movdqa xmm6, [c3]
956 + movdqa xmm4, [lsym(c1)]
957 + movdqa xmm5, [lsym(c2)]
958 + movdqa xmm6, [lsym(c3)]
959
960 mov esi, [esp + 20] ; src
961 mov edi, [esp + 28] ; dst
962 --- a/xorgxrdp/module/x86/a8r8g8b8_to_nv12_box_x86_sse2.asm
963 +++ b/xorgxrdp/module/x86/a8r8g8b8_to_nv12_box_x86_sse2.asm
964 @@ -1,5 +1,6 @@
965 ;
966 ;Copyright 2015 Jay Sorg
967 +;Copyright 2017 mirabilos
968 ;
969 ;Permission to use, copy, modify, distribute, and sell this software and its
970 ;documentation for any purpose is hereby granted without fee, provided that
971 @@ -26,7 +27,30 @@
972 ; height should be even and > 0
973
974 %ifidn __OUTPUT_FORMAT__,elf
975 -SECTION .note.GNU-stack noalloc noexec nowrite progbits
976 +section .note.GNU-stack noalloc noexec nowrite progbits
977 +%ifdef PIC
978 +section .text
979 +extern _GLOBAL_OFFSET_TABLE_ ; resolved by the linker for PIC objects
980 +..@get_GOT: ; thunk: ebx = caller's return address (..@ keeps it out of local-label scoping)
981 + mov ebx,dword [esp] ; NASM syntax has no "ptr" keyword
982 + ret
983 +%define lsym(name) ebx + name wrt ..gotoff
984 +%macro get_GOT 0
985 + call ..@get_GOT
986 +%%ref: add ebx,_GLOBAL_OFFSET_TABLE_+$$-%%ref wrt ..gotpc ; ebx holds the address of %%ref here; result is the GOT base
987 +%endmacro
988 +%else
989 +%endif
990 +%else
991 +; not ELF
992 +%ifdef PIC
993 +%error Position-Independent Code is currently only supported for ELF
994 +%endif
995 +%endif
996 +%ifndef lsym
997 +%define lsym(name) name
998 +%macro get_GOT 0
999 +%endmacro
1000 %endif
1001
1002 SECTION .data
1003 @@ -81,6 +105,7 @@ PROC a8r8g8b8_to_nv12_box_x86_sse2
1004 PROC _a8r8g8b8_to_nv12_box_x86_sse2
1005 %endif
1006 push ebx
1007 + get_GOT
1008 push esi
1009 push edi
1010 push ebp
1011 @@ -103,23 +128,23 @@ loop1:
1012 ; first line
1013 movdqu xmm0, [esi] ; 4 pixels, 16 bytes
1014 movdqa xmm1, xmm0 ; blue
1015 - pand xmm1, [cd255] ; blue
1016 + pand xmm1, [lsym(cd255)] ; blue
1017 movdqa xmm2, xmm0 ; green
1018 psrld xmm2, 8 ; green
1019 - pand xmm2, [cd255] ; green
1020 + pand xmm2, [lsym(cd255)] ; green
1021 movdqa xmm3, xmm0 ; red
1022 psrld xmm3, 16 ; red
1023 - pand xmm3, [cd255] ; red
1024 + pand xmm3, [lsym(cd255)] ; red
1025
1026 movdqu xmm0, [esi + 16] ; 4 pixels, 16 bytes
1027 movdqa xmm4, xmm0 ; blue
1028 - pand xmm4, [cd255] ; blue
1029 + pand xmm4, [lsym(cd255)] ; blue
1030 movdqa xmm5, xmm0 ; green
1031 psrld xmm5, 8 ; green
1032 - pand xmm5, [cd255] ; green
1033 + pand xmm5, [lsym(cd255)] ; green
1034 movdqa xmm6, xmm0 ; red
1035 psrld xmm6, 16 ; red
1036 - pand xmm6, [cd255] ; red
1037 + pand xmm6, [lsym(cd255)] ; red
1038
1039 packssdw xmm1, xmm4 ; xmm1 = 8 blues
1040 packssdw xmm2, xmm5 ; xmm2 = 8 greens
1041 @@ -129,14 +154,14 @@ loop1:
1042 movdqa xmm4, xmm1 ; blue
1043 movdqa xmm5, xmm2 ; green
1044 movdqa xmm6, xmm3 ; red
1045 - pmullw xmm4, [cw25]
1046 - pmullw xmm5, [cw129]
1047 - pmullw xmm6, [cw66]
1048 + pmullw xmm4, [lsym(cw25)]
1049 + pmullw xmm5, [lsym(cw129)]
1050 + pmullw xmm6, [lsym(cw66)]
1051 paddw xmm4, xmm5
1052 paddw xmm4, xmm6
1053 - paddw xmm4, [cw128]
1054 + paddw xmm4, [lsym(cw128)]
1055 psrlw xmm4, 8
1056 - paddw xmm4, [cw16]
1057 + paddw xmm4, [lsym(cw16)]
1058 packuswb xmm4, xmm7
1059 movq [edi], xmm4 ; out 8 bytes yyyyyyyy
1060
1061 @@ -144,14 +169,14 @@ loop1:
1062 movdqa xmm4, xmm1 ; blue
1063 movdqa xmm5, xmm2 ; green
1064 movdqa xmm6, xmm3 ; red
1065 - pmullw xmm4, [cw112]
1066 - pmullw xmm5, [cw74]
1067 - pmullw xmm6, [cw38]
1068 + pmullw xmm4, [lsym(cw112)]
1069 + pmullw xmm5, [lsym(cw74)]
1070 + pmullw xmm6, [lsym(cw38)]
1071 psubw xmm4, xmm5
1072 psubw xmm4, xmm6
1073 - paddw xmm4, [cw128]
1074 + paddw xmm4, [lsym(cw128)]
1075 psraw xmm4, 8
1076 - paddw xmm4, [cw128]
1077 + paddw xmm4, [lsym(cw128)]
1078 packuswb xmm4, xmm7
1079 movq LU1, xmm4 ; save for later
1080
1081 @@ -159,14 +184,14 @@ loop1:
1082 movdqa xmm6, xmm1 ; blue
1083 movdqa xmm5, xmm2 ; green
1084 movdqa xmm4, xmm3 ; red
1085 - pmullw xmm4, [cw112]
1086 - pmullw xmm5, [cw94]
1087 - pmullw xmm6, [cw18]
1088 + pmullw xmm4, [lsym(cw112)]
1089 + pmullw xmm5, [lsym(cw94)]
1090 + pmullw xmm6, [lsym(cw18)]
1091 psubw xmm4, xmm5
1092 psubw xmm4, xmm6
1093 - paddw xmm4, [cw128]
1094 + paddw xmm4, [lsym(cw128)]
1095 psraw xmm4, 8
1096 - paddw xmm4, [cw128]
1097 + paddw xmm4, [lsym(cw128)]
1098 packuswb xmm4, xmm7
1099 movq LV1, xmm4 ; save for later
1100
1101 @@ -177,23 +202,23 @@ loop1:
1102 ; second line
1103 movdqu xmm0, [esi] ; 4 pixels, 16 bytes
1104 movdqa xmm1, xmm0 ; blue
1105 - pand xmm1, [cd255] ; blue
1106 + pand xmm1, [lsym(cd255)] ; blue
1107 movdqa xmm2, xmm0 ; green
1108 psrld xmm2, 8 ; green
1109 - pand xmm2, [cd255] ; green
1110 + pand xmm2, [lsym(cd255)] ; green
1111 movdqa xmm3, xmm0 ; red
1112 psrld xmm3, 16 ; red
1113 - pand xmm3, [cd255] ; red
1114 + pand xmm3, [lsym(cd255)] ; red
1115
1116 movdqu xmm0, [esi + 16] ; 4 pixels, 16 bytes
1117 movdqa xmm4, xmm0 ; blue
1118 - pand xmm4, [cd255] ; blue
1119 + pand xmm4, [lsym(cd255)] ; blue
1120 movdqa xmm5, xmm0 ; green
1121 psrld xmm5, 8 ; green
1122 - pand xmm5, [cd255] ; green
1123 + pand xmm5, [lsym(cd255)] ; green
1124 movdqa xmm6, xmm0 ; red
1125 psrld xmm6, 16 ; red
1126 - pand xmm6, [cd255] ; red
1127 + pand xmm6, [lsym(cd255)] ; red
1128
1129 packssdw xmm1, xmm4 ; xmm1 = 8 blues
1130 packssdw xmm2, xmm5 ; xmm2 = 8 greens
1131 @@ -203,14 +228,14 @@ loop1:
1132 movdqa xmm4, xmm1 ; blue
1133 movdqa xmm5, xmm2 ; green
1134 movdqa xmm6, xmm3 ; red
1135 - pmullw xmm4, [cw25]
1136 - pmullw xmm5, [cw129]
1137 - pmullw xmm6, [cw66]
1138 + pmullw xmm4, [lsym(cw25)]
1139 + pmullw xmm5, [lsym(cw129)]
1140 + pmullw xmm6, [lsym(cw66)]
1141 paddw xmm4, xmm5
1142 paddw xmm4, xmm6
1143 - paddw xmm4, [cw128]
1144 + paddw xmm4, [lsym(cw128)]
1145 psrlw xmm4, 8
1146 - paddw xmm4, [cw16]
1147 + paddw xmm4, [lsym(cw16)]
1148 packuswb xmm4, xmm7
1149 movq [edi], xmm4 ; out 8 bytes yyyyyyyy
1150
1151 @@ -218,14 +243,14 @@ loop1:
1152 movdqa xmm4, xmm1 ; blue
1153 movdqa xmm5, xmm2 ; green
1154 movdqa xmm6, xmm3 ; red
1155 - pmullw xmm4, [cw112]
1156 - pmullw xmm5, [cw74]
1157 - pmullw xmm6, [cw38]
1158 + pmullw xmm4, [lsym(cw112)]
1159 + pmullw xmm5, [lsym(cw74)]
1160 + pmullw xmm6, [lsym(cw38)]
1161 psubw xmm4, xmm5
1162 psubw xmm4, xmm6
1163 - paddw xmm4, [cw128]
1164 + paddw xmm4, [lsym(cw128)]
1165 psraw xmm4, 8
1166 - paddw xmm4, [cw128]
1167 + paddw xmm4, [lsym(cw128)]
1168 packuswb xmm4, xmm7
1169 movq LU2, xmm4 ; save for later
1170
1171 @@ -233,48 +258,48 @@ loop1:
1172 movdqa xmm6, xmm1 ; blue
1173 movdqa xmm5, xmm2 ; green
1174 movdqa xmm4, xmm3 ; red
1175 - pmullw xmm4, [cw112]
1176 - pmullw xmm5, [cw94]
1177 - pmullw xmm6, [cw18]
1178 + pmullw xmm4, [lsym(cw112)]
1179 + pmullw xmm5, [lsym(cw94)]
1180 + pmullw xmm6, [lsym(cw18)]
1181 psubw xmm4, xmm5
1182 psubw xmm4, xmm6
1183 - paddw xmm4, [cw128]
1184 + paddw xmm4, [lsym(cw128)]
1185 psraw xmm4, 8
1186 - paddw xmm4, [cw128]
1187 + paddw xmm4, [lsym(cw128)]
1188 packuswb xmm4, xmm7
1189 movq LV2, xmm4 ; save for later
1190
1191 ; uv add and divide(average)
1192 movq mm1, LU1 ; u from first line
1193 movq mm3, mm1
1194 - pand mm1, [cw255]
1195 + pand mm1, [lsym(cw255)]
1196 psrlw mm3, 8
1197 - pand mm3, [cw255]
1198 + pand mm3, [lsym(cw255)]
1199 paddw mm1, mm3 ; add
1200 movq mm2, LU2 ; u from second line
1201 movq mm3, mm2
1202 - pand mm2, [cw255]
1203 + pand mm2, [lsym(cw255)]
1204 paddw mm1, mm2 ; add
1205 psrlw mm3, 8
1206 - pand mm3, [cw255]
1207 + pand mm3, [lsym(cw255)]
1208 paddw mm1, mm3 ; add
1209 - paddw mm1, [cw2] ; add 2
1210 + paddw mm1, [lsym(cw2)] ; add 2
1211 psrlw mm1, 2 ; div 4
1212
1213 movq mm2, LV1 ; v from first line
1214 movq mm4, mm2
1215 - pand mm2, [cw255]
1216 + pand mm2, [lsym(cw255)]
1217 psrlw mm4, 8
1218 - pand mm4, [cw255]
1219 + pand mm4, [lsym(cw255)]
1220 paddw mm2, mm4 ; add
1221 movq mm3, LV2 ; v from second line
1222 movq mm4, mm3
1223 - pand mm3, [cw255]
1224 + pand mm3, [lsym(cw255)]
1225 paddw mm2, mm3 ; add
1226 psrlw mm4, 8
1227 - pand mm4, [cw255]
1228 + pand mm4, [lsym(cw255)]
1229 paddw mm2, mm4 ; add
1230 - paddw mm2, [cw2] ; add 2
1231 + paddw mm2, [lsym(cw2)] ; add 2
1232 psrlw mm2, 2 ; div 4
1233
1234 packuswb mm1, mm1
1235 --- a/xorgxrdp/module/x86/uyvy_to_rgb32_x86_sse2.asm
1236 +++ b/xorgxrdp/module/x86/uyvy_to_rgb32_x86_sse2.asm
1237 @@ -1,5 +1,6 @@
1238 ;
1239 ;Copyright 2014 Jay Sorg
1240 +;Copyright 2017 mirabilos
1241 ;
1242 ;Permission to use, copy, modify, distribute, and sell this software and its
1243 ;documentation for any purpose is hereby granted without fee, provided that
1244 @@ -34,7 +35,30 @@
1245 ; 4096 9324 0
1246
1247 %ifidn __OUTPUT_FORMAT__,elf
1248 -SECTION .note.GNU-stack noalloc noexec nowrite progbits
1249 +section .note.GNU-stack noalloc noexec nowrite progbits
1250 +%ifdef PIC
1251 +section .text
1252 +extern _GLOBAL_OFFSET_TABLE_
1253 +..@get_GOT: ; thunk; "..@" keeps it out of local-label scope so any PROC can call it
1254 + mov ebx,dword [esp] ; ebx = return address of the calling "call" (NASM has no "ptr")
1255 + ret
1256 +%define lsym(name) ebx + name wrt ..gotoff
1257 +%macro get_GOT 0
1258 + call ..@get_GOT ; clobbers ebx: afterwards ebx = address of %%ref below
1259 +%%ref: add ebx,_GLOBAL_OFFSET_TABLE_+$$-%%ref wrt ..gotpc ; ebx = GOT base; addend must name the address held in ebx
1260 +%endmacro
1261 +%else
1262 +%endif
1263 +%else
1264 +; not ELF
1265 +%ifdef PIC
1266 +%error Position-Independent Code is currently only supported for ELF
1267 +%endif
1268 +%endif
1269 +%ifndef lsym
1270 +%define lsym(name) name
1271 +%macro get_GOT 0
1272 +%endmacro
1273 %endif
1274
1275 SECTION .data
1276 @@ -62,6 +86,7 @@ PROC uyvy_to_rgb32_x86_sse2
1277 PROC _uyvy_to_rgb32_x86_sse2
1278 %endif
1279 push ebx
1280 + get_GOT
1281 push esi
1282 push edi
1283 push ebp
1284 @@ -76,7 +101,7 @@ PROC _uyvy_to_rgb32_x86_sse2
1285
1286 mov ecx, eax
1287
1288 - movdqa xmm7, [c128]
1289 + movdqa xmm7, [lsym(c128)]
1290
1291 loop1:
1292 ; hi lo
1293 @@ -113,22 +138,22 @@ loop1:
1294 psllw xmm2, 4
1295
1296 ; r = y + hiword(4669 * (v << 4))
1297 - movdqa xmm4, [c4669]
1298 + movdqa xmm4, [lsym(c4669)]
1299 pmulhw xmm4, xmm1
1300 movdqa xmm3, xmm0
1301 paddw xmm3, xmm4
1302
1303 ; g = y - hiword(1616 * (u << 4)) - hiword(2378 * (v << 4))
1304 - movdqa xmm5, [c1616]
1305 + movdqa xmm5, [lsym(c1616)]
1306 pmulhw xmm5, xmm2
1307 - movdqa xmm6, [c2378]
1308 + movdqa xmm6, [lsym(c2378)]
1309 pmulhw xmm6, xmm1
1310 movdqa xmm4, xmm0
1311 psubw xmm4, xmm5
1312 psubw xmm4, xmm6
1313
1314 ; b = y + hiword(9324 * (u << 4))
1315 - movdqa xmm6, [c9324]
1316 + movdqa xmm6, [lsym(c9324)]
1317 pmulhw xmm6, xmm2
1318 movdqa xmm5, xmm0
1319 paddw xmm5, xmm6
1320 --- a/xorgxrdp/module/x86/yuy2_to_rgb32_x86_sse2.asm
1321 +++ b/xorgxrdp/module/x86/yuy2_to_rgb32_x86_sse2.asm
1322 @@ -1,5 +1,6 @@
1323 ;
1324 ;Copyright 2014 Jay Sorg
1325 +;Copyright 2017 mirabilos
1326 ;
1327 ;Permission to use, copy, modify, distribute, and sell this software and its
1328 ;documentation for any purpose is hereby granted without fee, provided that
1329 @@ -34,7 +35,30 @@
1330 ; 4096 9324 0
1331
1332 %ifidn __OUTPUT_FORMAT__,elf
1333 -SECTION .note.GNU-stack noalloc noexec nowrite progbits
1334 +section .note.GNU-stack noalloc noexec nowrite progbits
1335 +%ifdef PIC
1336 +section .text
1337 +extern _GLOBAL_OFFSET_TABLE_
1338 +..@get_GOT: ; thunk; "..@" keeps it out of local-label scope so any PROC can call it
1339 + mov ebx,dword [esp] ; ebx = return address of the calling "call" (NASM has no "ptr")
1340 + ret
1341 +%define lsym(name) ebx + name wrt ..gotoff
1342 +%macro get_GOT 0
1343 + call ..@get_GOT ; clobbers ebx: afterwards ebx = address of %%ref below
1344 +%%ref: add ebx,_GLOBAL_OFFSET_TABLE_+$$-%%ref wrt ..gotpc ; ebx = GOT base; addend must name the address held in ebx
1345 +%endmacro
1346 +%else
1347 +%endif
1348 +%else
1349 +; not ELF
1350 +%ifdef PIC
1351 +%error Position-Independent Code is currently only supported for ELF
1352 +%endif
1353 +%endif
1354 +%ifndef lsym
1355 +%define lsym(name) name
1356 +%macro get_GOT 0
1357 +%endmacro
1358 %endif
1359
1360 SECTION .data
1361 @@ -62,6 +86,7 @@ PROC yuy2_to_rgb32_x86_sse2
1362 PROC _yuy2_to_rgb32_x86_sse2
1363 %endif
1364 push ebx
1365 + get_GOT
1366 push esi
1367 push edi
1368 push ebp
1369 @@ -76,7 +101,7 @@ PROC _yuy2_to_rgb32_x86_sse2
1370
1371 mov ecx, eax
1372
1373 - movdqa xmm7, [c128]
1374 + movdqa xmm7, [lsym(c128)]
1375
1376 loop1:
1377 ; hi lo
1378 @@ -113,22 +138,22 @@ loop1:
1379 psllw xmm2, 4
1380
1381 ; r = y + hiword(4669 * (v << 4))
1382 - movdqa xmm4, [c4669]
1383 + movdqa xmm4, [lsym(c4669)]
1384 pmulhw xmm4, xmm1
1385 movdqa xmm3, xmm0
1386 paddw xmm3, xmm4
1387
1388 ; g = y - hiword(1616 * (u << 4)) - hiword(2378 * (v << 4))
1389 - movdqa xmm5, [c1616]
1390 + movdqa xmm5, [lsym(c1616)]
1391 pmulhw xmm5, xmm2
1392 - movdqa xmm6, [c2378]
1393 + movdqa xmm6, [lsym(c2378)]
1394 pmulhw xmm6, xmm1
1395 movdqa xmm4, xmm0
1396 psubw xmm4, xmm5
1397 psubw xmm4, xmm6
1398
1399 ; b = y + hiword(9324 * (u << 4))
1400 - movdqa xmm6, [c9324]
1401 + movdqa xmm6, [lsym(c9324)]
1402 pmulhw xmm6, xmm2
1403 movdqa xmm5, xmm0
1404 paddw xmm5, xmm6
77 systemd.diff
88 lfs.diff
99 i386-pic-asm-part1.diff
10 i386-pic-asm-part2.diff
2323
2424 ifneq (${DEB_BUILD_GNU_TYPE},${DEB_HOST_GNU_TYPE})
2525 CONFIGURE_ARGS+= --host=${DEB_HOST_GNU_TYPE}
26 endif
27
28 ifeq (i386,${DEB_HOST_ARCH_CPU})
29 # The following files must be rewritten to use PIC if -DPIC is passed:
30 # - [F] librfxcodec/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
31 # - [F] librfxcodec/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
32 # - [F] xorgxrdp/module/x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm
33 # - [F] xorgxrdp/module/x86/a8r8g8b8_to_nv12_box_x86_sse2.asm
34 # - [!] xorgxrdp/module/x86/i420_to_rgb32_x86_sse2.asm
35 # - [f] xorgxrdp/module/x86/uyvy_to_rgb32_x86_sse2.asm
36 # - [f] xorgxrdp/module/x86/yuy2_to_rgb32_x86_sse2.asm
37 # - [!] xorgxrdp/module/x86/yv12_to_rgb32_x86_sse2.asm
38 # Key: F = ebx freed (f = was already free), ! = excluded from C for now
39 # Documentation: http://www.nasm.us/doc/nasmdoc9.html#section-9.2
40 # Unfortunately, this requires reserving the EBX register, which
41 # is used extensively by this code; to avoid crashes or security
42 # issues in xrdp, we disable the code on any-i386 for stretch and
43 # will revisit this later and with upstream; amd64 isn’t affected
44 # (as it uses RIP-relative addressing), x32 and other architectures
45 # don’t use the assembly code at all.
46 # When fixed, change Build-Depends’ nasm component to:
47 # nasm [amd64 hurd-i386 i386 kfreebsd-amd64 kfreebsd-i386],
48 CONFIGURE_ARGS+= --without-simd
4926 endif
5027
5128 ifeq (x32,${DEB_HOST_ARCH})