0 | 0 |
|
1 | |
,'''
|
2 | |
,,;,, ,,,, ,,,,, ,,, ,,
|
3 | |
; ; ; ; ; ;
|
4 | |
; ,''''; '''', ; ; ;
|
5 | |
; ',,,,;, ,,,,,' ; ; ;
|
6 | |
|
7 | |
flat assembler 1.71
|
8 | |
Programmer's Manual
|
|
1 |
,'''
|
|
2 |
,,;,, ,,,, ,,,,, ,,, ,,
|
|
3 |
; ; ; ; ; ;
|
|
4 |
; ,''''; '''', ; ; ;
|
|
5 |
; ',,,,;, ,,,,,' ; ; ;
|
|
6 |
|
|
7 |
flat assembler 1.71
|
|
8 |
Programmer's Manual
|
9 | 9 |
|
10 | 10 |
|
11 | 11 |
Table of contents
|
|
13 | 13 |
|
14 | 14 |
Chapter 1 Introduction
|
15 | 15 |
|
16 | |
1.1 Compiler overview
|
17 | |
1.1.1 System requirements
|
18 | |
1.1.2 Executing compiler from command line
|
19 | |
1.1.3 Compiler messages
|
20 | |
1.1.4 Output formats
|
21 | |
|
22 | |
1.2 Assembly syntax
|
23 | |
1.2.1 Instruction syntax
|
24 | |
1.2.2 Data definitions
|
25 | |
1.2.3 Constants and labels
|
26 | |
1.2.4 Numerical expressions
|
27 | |
1.2.5 Jumps and calls
|
28 | |
1.2.6 Size settings
|
|
16 |
1.1 Compiler overview
|
|
17 |
1.1.1 System requirements
|
|
18 |
1.1.2 Executing compiler from command line
|
|
19 |
1.1.3 Compiler messages
|
|
20 |
1.1.4 Output formats
|
|
21 |
|
|
22 |
1.2 Assembly syntax
|
|
23 |
1.2.1 Instruction syntax
|
|
24 |
1.2.2 Data definitions
|
|
25 |
1.2.3 Constants and labels
|
|
26 |
1.2.4 Numerical expressions
|
|
27 |
1.2.5 Jumps and calls
|
|
28 |
1.2.6 Size settings
|
29 | 29 |
|
30 | 30 |
Chapter 2 Instruction set
|
31 | 31 |
|
32 | |
2.1 The x86 architecture instructions
|
33 | |
2.1.1 Data movement instructions
|
34 | |
2.1.2 Type conversion instructions
|
35 | |
2.1.3 Binary arithmetic instructions
|
36 | |
2.1.4 Decimal arithmetic instructions
|
37 | |
2.1.5 Logical instructions
|
38 | |
2.1.6 Control transfer instructions
|
39 | |
2.1.7 I/O instructions
|
40 | |
2.1.8 Strings operations
|
41 | |
2.1.9 Flag control instructions
|
42 | |
2.1.10 Conditional operations
|
43 | |
2.1.11 Miscellaneous instructions
|
44 | |
2.1.12 System instructions
|
45 | |
2.1.13 FPU instructions
|
46 | |
2.1.14 MMX instructions
|
47 | |
2.1.15 SSE instructions
|
48 | |
2.1.16 SSE2 instructions
|
49 | |
2.1.17 SSE3 instructions
|
50 | |
2.1.18 AMD 3DNow! instructions
|
51 | |
2.1.19 The x86-64 long mode instructions
|
52 | |
2.1.20 SSE4 instructions
|
53 | |
2.1.21 AVX instructions
|
54 | |
2.1.22 AVX2 instructions
|
55 | |
2.1.23 Auxiliary sets of computational instructions
|
56 | |
2.1.24 AVX-512 instructions
|
57 | |
2.1.25 Other extensions of instruction set
|
58 | |
|
59 | |
2.2 Control directives
|
60 | |
2.2.1 Numerical constants
|
61 | |
2.2.2 Conditional assembly
|
62 | |
2.2.3 Repeating blocks of instructions
|
63 | |
2.2.4 Addressing spaces
|
64 | |
2.2.5 Other directives
|
65 | |
2.2.6 Multiple passes
|
66 | |
|
67 | |
2.3 Preprocessor directives
|
68 | |
2.3.1 Including source files
|
69 | |
2.3.2 Symbolic constants
|
70 | |
2.3.3 Macroinstructions
|
71 | |
2.3.4 Structures
|
72 | |
2.3.5 Repeating macroinstructions
|
73 | |
2.3.6 Conditional preprocessing
|
74 | |
2.3.7 Order of processing
|
75 | |
|
76 | |
2.4 Formatter directives
|
77 | |
2.4.1 MZ executable
|
78 | |
2.4.2 Portable Executable
|
79 | |
2.4.3 Common Object File Format
|
80 | |
2.4.4 Executable and Linkable Format
|
|
32 |
2.1 The x86 architecture instructions
|
|
33 |
2.1.1 Data movement instructions
|
|
34 |
2.1.2 Type conversion instructions
|
|
35 |
2.1.3 Binary arithmetic instructions
|
|
36 |
2.1.4 Decimal arithmetic instructions
|
|
37 |
2.1.5 Logical instructions
|
|
38 |
2.1.6 Control transfer instructions
|
|
39 |
2.1.7 I/O instructions
|
|
40 |
2.1.8 Strings operations
|
|
41 |
2.1.9 Flag control instructions
|
|
42 |
2.1.10 Conditional operations
|
|
43 |
2.1.11 Miscellaneous instructions
|
|
44 |
2.1.12 System instructions
|
|
45 |
2.1.13 FPU instructions
|
|
46 |
2.1.14 MMX instructions
|
|
47 |
2.1.15 SSE instructions
|
|
48 |
2.1.16 SSE2 instructions
|
|
49 |
2.1.17 SSE3 instructions
|
|
50 |
2.1.18 AMD 3DNow! instructions
|
|
51 |
2.1.19 The x86-64 long mode instructions
|
|
52 |
2.1.20 SSE4 instructions
|
|
53 |
2.1.21 AVX instructions
|
|
54 |
2.1.22 AVX2 instructions
|
|
55 |
2.1.23 Auxiliary sets of computational instructions
|
|
56 |
2.1.24 AVX-512 instructions
|
|
57 |
2.1.25 Other extensions of instruction set
|
|
58 |
|
|
59 |
2.2 Control directives
|
|
60 |
2.2.1 Numerical constants
|
|
61 |
2.2.2 Conditional assembly
|
|
62 |
2.2.3 Repeating blocks of instructions
|
|
63 |
2.2.4 Addressing spaces
|
|
64 |
2.2.5 Other directives
|
|
65 |
2.2.6 Multiple passes
|
|
66 |
|
|
67 |
2.3 Preprocessor directives
|
|
68 |
2.3.1 Including source files
|
|
69 |
2.3.2 Symbolic constants
|
|
70 |
2.3.3 Macroinstructions
|
|
71 |
2.3.4 Structures
|
|
72 |
2.3.5 Repeating macroinstructions
|
|
73 |
2.3.6 Conditional preprocessing
|
|
74 |
2.3.7 Order of processing
|
|
75 |
|
|
76 |
2.4 Formatter directives
|
|
77 |
2.4.1 MZ executable
|
|
78 |
2.4.2 Portable Executable
|
|
79 |
2.4.3 Common Object File Format
|
|
80 |
2.4.4 Executable and Linkable Format
|
81 | 81 |
|
82 | 82 |
|
83 | 83 |
|
|
145 | 145 |
destination file.
|
146 | 146 |
The following is an example of the compilation summary:
|
147 | 147 |
|
148 | |
flat assembler version 1.70 (16384 kilobytes memory)
|
|
148 |
flat assembler version 1.70 (16384 kilobytes memory)
|
149 | 149 |
38 passes, 5.3 seconds, 77824 bytes.
|
150 | 150 |
|
151 | 151 |
In case of error during the compilation process, the program will display an
|
152 | 152 |
error message. For example, when compiler can't find the input file, it will
|
153 | 153 |
display the following message:
|
154 | 154 |
|
155 | |
flat assembler version 1.70 (16384 kilobytes memory)
|
|
155 |
flat assembler version 1.70 (16384 kilobytes memory)
|
156 | 156 |
error: source file not found.
|
157 | 157 |
|
158 | 158 |
If the error is connected with a specific part of source code, the source line
|
159 | 159 |
that caused the error will also be displayed. Also placement of this line in
|
160 | 160 |
the source is given to help you find this error, for example:
|
161 | 161 |
|
162 | |
flat assembler version 1.70 (16384 kilobytes memory)
|
|
162 |
flat assembler version 1.70 (16384 kilobytes memory)
|
163 | 163 |
example.asm [3]:
|
164 | |
mob ax,1
|
|
164 |
mob ax,1
|
165 | 165 |
error: illegal instruction.
|
166 | 166 |
|
167 | 167 |
It means that in the third line of the "example.asm" file compiler has
|
|
169 | 169 |
contains a macroinstruction, also the line in macroinstruction definition
|
170 | 170 |
that generated the erroneous instruction is displayed:
|
171 | 171 |
|
172 | |
flat assembler version 1.70 (16384 kilobytes memory)
|
|
172 |
flat assembler version 1.70 (16384 kilobytes memory)
|
173 | 173 |
example.asm [6]:
|
174 | |
stoschar 7
|
|
174 |
stoschar 7
|
175 | 175 |
example.asm [3] stoschar [1]:
|
176 | |
mob al,char
|
|
176 |
mob al,char
|
177 | 177 |
error: illegal instruction.
|
178 | 178 |
|
179 | 179 |
It means that the macroinstruction in the sixth line of the "example.asm" file
|
|
258 | 258 |
| xword | 128 | 16 |
|
259 | 259 |
| qqword | 256 | 32 |
|
260 | 260 |
| yword | 256 | 32 |
|
|
261 |
| dqqword | 512 | 64 |
|
|
262 |
| zword | 512 | 64 |
|
261 | 263 |
\-------------------------/
|
262 | 264 |
|
263 | 265 |
Table 1.2 Registers
|
264 | 266 |
/-----------------------------------------------------------------\
|
265 | |
| Type | Bits | |
|
|
267 |
| Type | Bits | |
|
266 | 268 |
|=========|======|================================================|
|
267 | |
| | 8 | al cl dl bl ah ch dh bh |
|
268 | |
| General | 16 | ax cx dx bx sp bp si di |
|
269 | |
| | 32 | eax ecx edx ebx esp ebp esi edi |
|
|
269 |
| | 8 | al cl dl bl ah ch dh bh |
|
|
270 |
| General | 16 | ax cx dx bx sp bp si di |
|
|
271 |
| | 32 | eax ecx edx ebx esp ebp esi edi |
|
270 | 272 |
|---------|------|------------------------------------------------|
|
271 | |
| Segment | 16 | es cs ss ds fs gs |
|
|
273 |
| Segment | 16 | es cs ss ds fs gs |
|
272 | 274 |
|---------|------|------------------------------------------------|
|
273 | |
| Control | 32 | cr0 cr2 cr3 cr4 |
|
|
275 |
| Control | 32 | cr0 cr2 cr3 cr4 |
|
274 | 276 |
|---------|------|------------------------------------------------|
|
275 | |
| Debug | 32 | dr0 dr1 dr2 dr3 dr6 dr7 |
|
|
277 |
| Debug | 32 | dr0 dr1 dr2 dr3 dr6 dr7 |
|
276 | 278 |
|---------|------|------------------------------------------------|
|
277 | |
| FPU | 80 | st0 st1 st2 st3 st4 st5 st6 st7 |
|
|
279 |
| FPU | 80 | st0 st1 st2 st3 st4 st5 st6 st7 |
|
278 | 280 |
|---------|------|------------------------------------------------|
|
279 | |
| MMX | 64 | mm0 mm1 mm2 mm3 mm4 mm5 mm6 mm7 |
|
|
281 |
| MMX | 64 | mm0 mm1 mm2 mm3 mm4 mm5 mm6 mm7 |
|
280 | 282 |
|---------|------|------------------------------------------------|
|
281 | 283 |
| SSE | 128 | xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 |
|
282 | 284 |
|---------|------|------------------------------------------------|
|
283 | 285 |
| AVX | 256 | ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 |
|
|
286 |
|---------|------|------------------------------------------------|
|
|
287 |
| AVX-512 | 512 | zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 |
|
|
288 |
|---------|------|------------------------------------------------|
|
|
289 |
| Opmask | 64 | k0 k1 k2 k3 k4 k5 k6 k7 |
|
|
290 |
|---------|------|------------------------------------------------|
|
|
291 |
| Bounds | 128 | bnd0 bnd1 bnd2 bnd3 |
|
284 | 292 |
\-----------------------------------------------------------------/
|
285 | 293 |
|
286 | 294 |
|
|
331 | 339 |
| Size | Define | Reserve |
|
332 | 340 |
| (bytes) | data | data |
|
333 | 341 |
|=========|========|=========|
|
334 | |
| 1 | db | rb |
|
335 | |
| | file | |
|
|
342 |
| 1 | db | rb |
|
|
343 |
| | file | |
|
336 | 344 |
|---------|--------|---------|
|
337 | |
| 2 | dw | rw |
|
338 | |
| | du | |
|
|
345 |
| 2 | dw | rw |
|
|
346 |
| | du | |
|
339 | 347 |
|---------|--------|---------|
|
340 | |
| 4 | dd | rd |
|
|
348 |
| 4 | dd | rd |
|
341 | 349 |
|---------|--------|---------|
|
342 | |
| 6 | dp | rp |
|
343 | |
| | df | rf |
|
|
350 |
| 6 | dp | rp |
|
|
351 |
| | df | rf |
|
344 | 352 |
|---------|--------|---------|
|
345 | |
| 8 | dq | rq |
|
|
353 |
| 8 | dq | rq |
|
346 | 354 |
|---------|--------|---------|
|
347 | |
| 10 | dt | rt |
|
|
355 |
| 10 | dt | rt |
|
348 | 356 |
\----------------------------/
|
349 | 357 |
|
350 | 358 |
|
|
454 | 462 |
/-------------------------\
|
455 | 463 |
| Priority | Operators |
|
456 | 464 |
|==========|==============|
|
457 | |
| 0 | + - |
|
|
465 |
| 0 | + - |
|
458 | 466 |
|----------|--------------|
|
459 | |
| 1 | * / |
|
|
467 |
| 1 | * / |
|
460 | 468 |
|----------|--------------|
|
461 | |
| 2 | mod |
|
|
469 |
| 2 | mod |
|
462 | 470 |
|----------|--------------|
|
463 | |
| 3 | and or xor |
|
|
471 |
| 3 | and or xor |
|
464 | 472 |
|----------|--------------|
|
465 | |
| 4 | shl shr |
|
|
473 |
| 4 | shl shr |
|
466 | 474 |
|----------|--------------|
|
467 | |
| 5 | not |
|
|
475 |
| 5 | not |
|
468 | 476 |
|----------|--------------|
|
469 | |
| 6 | bsf bsr |
|
|
477 |
| 6 | bsf bsr |
|
470 | 478 |
|----------|--------------|
|
471 | |
| 7 | rva plt |
|
|
479 |
| 7 | rva plt |
|
472 | 480 |
\-------------------------/
|
473 | 481 |
|
474 | 482 |
|
|
553 | 561 |
operand are the same. Below are the examples for each of the allowed
|
554 | 562 |
combinations:
|
555 | 563 |
|
556 | |
mov bx,ax ; general register to general register
|
|
564 |
mov bx,ax ; general register to general register
|
557 | 565 |
mov [char],al ; general register to memory
|
558 | 566 |
mov bl,[char] ; memory to general register
|
559 | |
mov dl,32 ; immediate value to general register
|
|
567 |
mov dl,32 ; immediate value to general register
|
560 | 568 |
mov [char],32 ; immediate value to memory
|
561 | |
mov ax,ds ; segment register to general register
|
|
569 |
mov ax,ds ; segment register to general register
|
562 | 570 |
mov [bx],ds ; segment register to memory
|
563 | |
mov ds,ax ; general register to segment register
|
|
571 |
mov ds,ax ; general register to segment register
|
564 | 572 |
mov ds,[bx] ; memory to segment register
|
565 | 573 |
mov eax,cr0 ; control register to general register
|
566 | 574 |
mov cr3,ebx ; general register to control register
|
|
570 | 578 |
important. The operands may be two general registers, or general register
|
571 | 579 |
with memory. For example:
|
572 | 580 |
|
573 | |
xchg ax,bx ; swap two general registers
|
|
581 |
xchg ax,bx ; swap two general registers
|
574 | 582 |
xchg al,[char] ; swap register with memory
|
575 | 583 |
|
576 | 584 |
"push" decrements the stack frame pointer (ESP register), then transfers
|
|
584 | 592 |
spaces, not commas), compiler will assemble chain of the "push" instructions
|
585 | 593 |
with these operands. The examples are with single operands:
|
586 | 594 |
|
587 | |
push ax ; store general register
|
588 | |
push es ; store segment register
|
589 | |
pushw [bx] ; store memory
|
590 | |
push 1000h ; store immediate value
|
|
595 |
push ax ; store general register
|
|
596 |
push es ; store segment register
|
|
597 |
pushw [bx] ; store memory
|
|
598 |
push 1000h ; store immediate value
|
591 | 599 |
|
592 | 600 |
"pusha" saves the contents of the eight general registers on the stack.
|
593 | 601 |
This instruction has no operands. There are two versions of this instruction,
|
|
606 | 614 |
follow in the same line, compiler will assemble chain of the "pop"
|
607 | 615 |
instructions with these operands.
|
608 | 616 |
|
609 | |
pop bx ; restore general register
|
610 | |
pop ds ; restore segment register
|
611 | |
popw [si] ; restore memory
|
|
617 |
pop bx ; restore general register
|
|
618 |
pop ds ; restore segment register
|
|
619 |
popw [si] ; restore memory
|
612 | 620 |
|
613 | 621 |
"popa" restores the registers saved on the stack by "pusha" instruction,
|
614 | 622 |
except for the saved value of SP (or ESP), which is ignored. This instruction
|
|
634 | 642 |
extension. The source operand can be general register or memory, while the
|
635 | 643 |
destination operand must be a general register. For example:
|
636 | 644 |
|
637 | |
movsx ax,al ; byte register to word register
|
638 | |
movsx edx,dl ; byte register to double word register
|
639 | |
movsx eax,ax ; word register to double word register
|
640 | |
movsx ax,byte [bx] ; byte memory to word register
|
|
645 |
movsx ax,al ; byte register to word register
|
|
646 |
movsx edx,dl ; byte register to double word register
|
|
647 |
movsx eax,ax ; word register to double word register
|
|
648 |
movsx ax,byte [bx] ; byte memory to word register
|
641 | 649 |
movsx edx,byte [bx] ; byte memory to double word register
|
642 | 650 |
movsx eax,word [bx] ; word memory to double word register
|
643 | 651 |
|
|
650 | 658 |
register or memory, the source operand can be general register or immediate
|
651 | 659 |
value, it can also be memory if the destination operand is register.
|
652 | 660 |
|
653 | |
add ax,bx ; add register to register
|
|
661 |
add ax,bx ; add register to register
|
654 | 662 |
add ax,[si] ; add memory to register
|
655 | 663 |
add [di],al ; add register to memory
|
656 | |
add al,48 ; add immediate value to register
|
|
664 |
add al,48 ; add immediate value to register
|
657 | 665 |
add [char],48 ; add immediate value to memory
|
658 | 666 |
|
659 | 667 |
"adc" sums the operands, adds one if CF is set, and replaces the destination
|
|
664 | 672 |
general register or memory, and the size of the operand can be byte, word or
|
665 | 673 |
double word.
|
666 | 674 |
|
667 | |
inc ax ; increment register by one
|
|
675 |
inc ax ; increment register by one
|
668 | 676 |
inc byte [bx] ; increment memory by one
|
669 | 677 |
|
670 | 678 |
"sub" subtracts the source operand from the destination operand and replaces
|
|
720 | 728 |
because, whether the operands are signed or unsigned, the lower half of the
|
721 | 729 |
product is the same. Below are the examples for all three forms:
|
722 | 730 |
|
723 | |
imul bl ; accumulator by register
|
|
731 |
imul bl ; accumulator by register
|
724 | 732 |
imul word [si] ; accumulator by memory
|
725 | |
imul bx,cx ; register by register
|
|
733 |
imul bx,cx ; register by register
|
726 | 734 |
imul bx,[si] ; register by memory
|
727 | |
imul bx,10 ; register by immediate value
|
|
735 |
imul bx,10 ; register by immediate value
|
728 | 736 |
imul ax,bx,10 ; register by immediate value to register
|
729 | 737 |
imul ax,[si],10 ; memory by immediate value to register
|
730 | 738 |
|
|
805 | 813 |
1, "btr" resets the selected bit to 0, "btc" changes the bit to its
|
806 | 814 |
complement. The first operand can be word or double word.
|
807 | 815 |
|
808 | |
bt ax,15 ; test bit in register
|
|
816 |
bt ax,15 ; test bit in register
|
809 | 817 |
bts word [bx],15 ; test and set bit in memory
|
810 | |
btr ax,cx ; test and reset bit in register
|
|
818 |
btr ax,cx ; test and reset bit in register
|
811 | 819 |
btc word [bx],cx ; test and complement bit in memory
|
812 | 820 |
|
813 | 821 |
"bsf" and "bsr" instructions scan a word or double word for first set bit
|
|
820 | 828 |
order to low order (starting from bit index 15 of a word or index 31 of a
|
821 | 829 |
double word).
|
822 | 830 |
|
823 | |
bsf ax,bx ; scan register forward
|
|
831 |
bsf ax,bx ; scan register forward
|
824 | 832 |
bsr ax,[si] ; scan memory reverse
|
825 | 833 |
|
826 | 834 |
"shl" shifts the destination operand left by the number of bits specified
|
|
830 | 838 |
side of the operand as bits exit from the left side. The last bit that exited
|
831 | 839 |
is stored in CF. "sal" is a synonym for "shl".
|
832 | 840 |
|
833 | |
shl al,1 ; shift register left by one bit
|
|
841 |
shl al,1 ; shift register left by one bit
|
834 | 842 |
shl byte [bx],1 ; shift memory left by one bit
|
835 | |
shl ax,cl ; shift register left by count from cl
|
|
843 |
shl ax,cl ; shift register left by count from cl
|
836 | 844 |
shl word [bx],cl ; shift memory left by count from cl
|
837 | 845 |
|
838 | 846 |
"shr" and "sar" shift the destination operand right by the number of bits
|
|
879 | 887 |
bits 16 through 23. This instruction is provided for converting little-endian
|
880 | 888 |
values to big-endian format and vice versa.
|
881 | 889 |
|
882 | |
bswap edx ; swap bytes in register
|
|
890 |
bswap edx ; swap bytes in register
|
883 | 891 |
|
884 | 892 |
|
885 | 893 |
2.1.6 Control transfer instructions
|
|
903 | 911 |
variable, the operand should be general register or memory. See also 1.2.5 for
|
904 | 912 |
some more details.
|
905 | 913 |
|
906 | |
jmp 100h ; direct near jump
|
|
914 |
jmp 100h ; direct near jump
|
907 | 915 |
jmp 0FFFFh:0 ; direct far jump
|
908 | |
jmp ax ; indirect near jump
|
|
916 |
jmp ax ; indirect near jump
|
909 | 917 |
jmp pword [ebx] ; indirect far jump
|
910 | 918 |
|
911 | 919 |
"call" transfers control to the procedure, saving on the stack the address
|
|
942 | 950 |
|
943 | 951 |
Table 2.1 Conditions
|
944 | 952 |
/-----------------------------------------------------------\
|
945 | |
| Mnemonic | Condition tested | Description |
|
|
953 |
| Mnemonic | Condition tested | Description |
|
946 | 954 |
|==========|=======================|========================|
|
947 | |
| o | OF = 1 | overflow |
|
|
955 |
| o | OF = 1 | overflow |
|
948 | 956 |
|----------|-----------------------|------------------------|
|
949 | |
| no | OF = 0 | not overflow |
|
|
957 |
| no | OF = 0 | not overflow |
|
950 | 958 |
|----------|-----------------------|------------------------|
|
951 | |
| c | | carry |
|
952 | |
| b | CF = 1 | below |
|
953 | |
| nae | | not above nor equal |
|
|
959 |
| c | | carry |
|
|
960 |
| b | CF = 1 | below |
|
|
961 |
| nae | | not above nor equal |
|
954 | 962 |
|----------|-----------------------|------------------------|
|
955 | |
| nc | | not carry |
|
956 | |
| ae | CF = 0 | above or equal |
|
957 | |
| nb | | not below |
|
|
963 |
| nc | | not carry |
|
|
964 |
| ae | CF = 0 | above or equal |
|
|
965 |
| nb | | not below |
|
958 | 966 |
|----------|-----------------------|------------------------|
|
959 | |
| e | ZF = 1 | equal |
|
960 | |
| z | | zero |
|
|
967 |
| e | ZF = 1 | equal |
|
|
968 |
| z | | zero |
|
961 | 969 |
|----------|-----------------------|------------------------|
|
962 | |
| ne | ZF = 0 | not equal |
|
963 | |
| nz | | not zero |
|
|
970 |
| ne | ZF = 0 | not equal |
|
|
971 |
| nz | | not zero |
|
964 | 972 |
|----------|-----------------------|------------------------|
|
965 | |
| be | CF or ZF = 1 | below or equal |
|
966 | |
| na | | not above |
|
|
973 |
| be | CF or ZF = 1 | below or equal |
|
|
974 |
| na | | not above |
|
967 | 975 |
|----------|-----------------------|------------------------|
|
968 | |
| a | CF or ZF = 0 | above |
|
969 | |
| nbe | | not below nor equal |
|
|
976 |
| a | CF or ZF = 0 | above |
|
|
977 |
| nbe | | not below nor equal |
|
970 | 978 |
|----------|-----------------------|------------------------|
|
971 | |
| s | SF = 1 | sign |
|
|
979 |
| s | SF = 1 | sign |
|
972 | 980 |
|----------|-----------------------|------------------------|
|
973 | |
| ns | SF = 0 | not sign |
|
|
981 |
| ns | SF = 0 | not sign |
|
974 | 982 |
|----------|-----------------------|------------------------|
|
975 | |
| p | PF = 1 | parity |
|
976 | |
| pe | | parity even |
|
|
983 |
| p | PF = 1 | parity |
|
|
984 |
| pe | | parity even |
|
977 | 985 |
|----------|-----------------------|------------------------|
|
978 | |
| np | PF = 0 | not parity |
|
979 | |
| po | | parity odd |
|
|
986 |
| np | PF = 0 | not parity |
|
|
987 |
| po | | parity odd |
|
980 | 988 |
|----------|-----------------------|------------------------|
|
981 | |
| l | SF xor OF = 1 | less |
|
982 | |
| nge | | not greater nor equal |
|
|
989 |
| l | SF xor OF = 1 | less |
|
|
990 |
| nge | | not greater nor equal |
|
983 | 991 |
|----------|-----------------------|------------------------|
|
984 | |
| ge | SF xor OF = 0 | greater or equal |
|
985 | |
| nl | | not less |
|
|
992 |
| ge | SF xor OF = 0 | greater or equal |
|
|
993 |
| nl | | not less |
|
986 | 994 |
|----------|-----------------------|------------------------|
|
987 | |
| le | (SF xor OF) or ZF = 1 | less or equal |
|
988 | |
| ng | | not greater |
|
|
995 |
| le | (SF xor OF) or ZF = 1 | less or equal |
|
|
996 |
| ng | | not greater |
|
989 | 997 |
|----------|-----------------------|------------------------|
|
990 | |
| g | (SF xor OF) or ZF = 0 | greater |
|
991 | |
| nle | | not less nor equal |
|
|
998 |
| g | (SF xor OF) or ZF = 0 | greater |
|
|
999 |
| nle | | not less nor equal |
|
992 | 1000 |
\-----------------------------------------------------------/
|
993 | 1001 |
|
994 | 1002 |
The "loop" instructions are conditional jumps that use a value placed in
|
|
1037 | 1045 |
operand should be AL, AX, or EAX register. The source operand should be an
|
1038 | 1046 |
immediate value in range from 0 to 255, or DX register.
|
1039 | 1047 |
|
1040 | |
in al,20h ; input byte from port 20h
|
1041 | |
in ax,dx ; input word from port addressed by dx
|
|
1048 |
in al,20h ; input byte from port 20h
|
|
1049 |
in ax,dx ; input word from port addressed by dx
|
1042 | 1050 |
|
1043 | 1051 |
"out" transfers a byte, word, or double word to an output port from AL, AX,
|
1044 | 1052 |
or EAX. The program can specify the number of the port using the same methods
|
|
1046 | 1054 |
in range from 0 to 255, or DX register. The source operand should be AL, AX,
|
1047 | 1055 |
or EAX register.
|
1048 | 1056 |
|
1049 | |
out 20h,ax ; output word to port 20h
|
1050 | |
out dx,al ; output byte to port addressed by dx
|
|
1057 |
out 20h,ax ; output word to port 20h
|
|
1058 |
out dx,al ; output byte to port addressed by dx
|
1051 | 1059 |
|
1052 | 1060 |
|
1053 | 1061 |
2.1.8 Strings operations
|
|
1077 | 1085 |
|
1078 | 1086 |
movs byte [di],[si] ; transfer byte
|
1079 | 1087 |
movs word [es:di],[ss:si] ; transfer word
|
1080 | |
movsd ; transfer double word
|
|
1088 |
movsd ; transfer double word
|
1081 | 1089 |
|
1082 | 1090 |
"cmps" subtracts the destination string element from the source string
|
1083 | 1091 |
element and updates the flags AF, SF, PF, CF and OF, but it does not change
|
|
1087 | 1095 |
second operand should be the destination string element addressed by DI or
|
1088 | 1096 |
EDI.
|
1089 | 1097 |
|
1090 | |
cmpsb ; compare bytes
|
|
1098 |
cmpsb ; compare bytes
|
1091 | 1099 |
cmps word [ds:si],[es:di] ; compare words
|
1092 | 1100 |
cmps dword [fs:esi],[edi] ; compare double words
|
1093 | 1101 |
|
|
1096 | 1104 |
PF, CF and OF. If the values are equal, ZF is set, otherwise it is cleared.
|
1097 | 1105 |
The operand should be the destination string element addressed by DI or EDI.
|
1098 | 1106 |
|
1099 | |
scas byte [es:di] ; scan byte
|
1100 | |
scasw ; scan word
|
|
1107 |
scas byte [es:di] ; scan byte
|
|
1108 |
scasw ; scan word
|
1101 | 1109 |
scas dword [es:edi] ; scan double word
|
1102 | 1110 |
|
1103 | 1111 |
"stos" places the value of AL, AX, or EAX into the destination string
|
|
1106 | 1114 |
should be the source string element addressed by SI or ESI with any segment
|
1107 | 1115 |
prefix.
|
1108 | 1116 |
|
1109 | |
lods byte [ds:si] ; load byte
|
1110 | |
lods word [cs:si] ; load word
|
1111 | |
lodsd ; load double word
|
|
1117 |
lods byte [ds:si] ; load byte
|
|
1118 |
lods word [cs:si] ; load word
|
|
1119 |
lodsd ; load double word
|
1112 | 1120 |
|
1113 | 1121 |
"ins" transfers a byte, word, or double word from an input port addressed
|
1114 | 1122 |
by DX register to the destination string element. The destination operand
|
1115 | 1123 |
should be memory addressed by DI or EDI, the source operand should be the DX
|
1116 | 1124 |
register.
|
1117 | 1125 |
|
1118 | |
insb ; input byte
|
|
1126 |
insb ; input byte
|
1119 | 1127 |
ins word [es:di],dx ; input word
|
1120 | |
ins dword [edi],dx ; input double word
|
|
1128 |
ins dword [edi],dx ; input double word
|
1121 | 1129 |
|
1122 | 1130 |
"outs" transfers the source string element to an output port addressed by
|
1123 | 1131 |
DX register. The destination operand should be the DX register and the source
|
1124 | 1132 |
operand should be memory addressed by SI or ESI with any segment prefix.
|
1125 | 1133 |
|
1126 | |
outs dx,byte [si] ; output byte
|
1127 | |
outsw ; output word
|
|
1134 |
outs dx,byte [si] ; output byte
|
|
1135 |
outsw ; output word
|
1128 | 1136 |
outs dx,dword [gs:esi] ; output double word
|
1129 | 1137 |
|
1130 | 1138 |
The repeat prefixes "rep", "repe"/"repz", and "repne"/"repnz" specify
|
|
1141 | 1149 |
the execution when the ZF is zero, "repne" and "repnz" terminate the execution
|
1142 | 1150 |
when the ZF is set.
|
1143 | 1151 |
|
1144 | |
rep movsd ; transfer multiple double words
|
1145 | |
repe cmpsb ; compare bytes until not equal
|
|
1152 |
rep movsd ; transfer multiple double words
|
|
1153 |
repe cmpsb ; compare bytes until not equal
|
1146 | 1154 |
|
1147 | 1155 |
|
1148 | 1156 |
2.1.9 Flag control instructions
|
|
1169 | 1177 |
and "popfd" forces restoring from the double word.
|
1170 | 1178 |
|
1171 | 1179 |
|
1172 | |
2.1.10 Conditional operations
|
|
1180 |
2.1.10 Conditional operations
|
1173 | 1181 |
|
1174 | 1182 |
The instructions obtained by attaching the condition mnemonic (see table
|
1175 | 1183 |
2.1) to the "set" mnemonic set a byte to one if the condition is true and set
|
1176 | 1184 |
the byte to zero otherwise. The operand should be an 8-bit general register
|
1177 | 1185 |
or the byte in memory.
|
1178 | 1186 |
|
1179 | |
setne al ; set al if zero flag cleared
|
|
1187 |
setne al ; set al if zero flag cleared
|
1180 | 1188 |
seto byte [bx] ; set byte if overflow
|
1181 | 1189 |
|
1182 | 1190 |
"salc" instruction sets all the bits of AL register when the carry flag is
|
|
1208 | 1216 |
cmpxchg8b [bx] ; compare and exchange 8 bytes
|
1209 | 1217 |
|
1210 | 1218 |
|
1211 | |
2.1.11 Miscellaneous instructions
|
|
1219 |
2.1.11 Miscellaneous instructions
|
1212 | 1220 |
|
1213 | 1221 |
"nop" instruction occupies one byte but affects nothing but the instruction
|
1214 | 1222 |
pointer. This instruction has no operands and doesn't perform any operation.
|
|
1269 | 1277 |
enter 2048,0 ; enter and allocate 2048 bytes on stack
|
1270 | 1278 |
|
1271 | 1279 |
|
1272 | |
2.1.12 System instructions
|
|
1280 |
2.1.12 System instructions
|
1273 | 1281 |
|
1274 | 1282 |
"lmsw" loads the operand into the machine status word (bits 0 through 15 of
|
1275 | 1283 |
CR0 register), while "smsw" stores the machine status word into the
|
|
1277 | 1285 |
general register or memory, for "smsw" it can also be 32-bit general
|
1278 | 1286 |
register.
|
1279 | 1287 |
|
1280 | |
lmsw ax ; load machine status from register
|
1281 | |
smsw [bx] ; store machine status to memory
|
|
1288 |
lmsw ax ; load machine status from register
|
|
1289 |
smsw [bx] ; store machine status to memory
|
1282 | 1290 |
|
1283 | 1291 |
"lgdt" and "lidt" instructions load the values in operand into the global
|
1284 | 1292 |
descriptor table register or the interrupt descriptor table register
|
|
1286 | 1294 |
table register or the interrupt descriptor table register in the destination
|
1287 | 1295 |
operand. The operand should be 6 bytes in memory.
|
1288 | 1296 |
|
1289 | |
lgdt [ebx] ; load global descriptor table
|
|
1297 |
lgdt [ebx] ; load global descriptor table
|
1290 | 1298 |
|
1291 | 1299 |
"lldt" loads the operand into the segment selector field of the local
|
1292 | 1300 |
descriptor table register and "sldt" stores the segment selector from the
|
|
1300 | 1308 |
The source operand should be a 16-bit general register or memory.
|
1301 | 1309 |
|
1302 | 1310 |
lar ax,[bx] ; load access rights into word
|
1303 | |
lar eax,dx ; load access rights into double word
|
|
1311 |
lar eax,dx ; load access rights into double word
|
1304 | 1312 |
|
1305 | 1313 |
"lsl" loads the segment limit from the segment descriptor specified by the
|
1306 | 1314 |
selector in source operand into the destination operand and sets the ZF flag.
|
|
1320 | 1328 |
destination operand. The destination operand can be a word general register
|
1321 | 1329 |
or memory, the source operand must be a general register.
|
1322 | 1330 |
|
1323 | |
arpl bx,ax ; adjust RPL of selector in register
|
|
1331 |
arpl bx,ax ; adjust RPL of selector in register
|
1324 | 1332 |
arpl [bx],ax ; adjust RPL of selector in memory
|
1325 | 1333 |
|
1326 | 1334 |
"clts" clears the TS (task switched) flag in the CR0 register. This
|
|
1366 | 1374 |
instructions are stored in MSRs. These instructions have no operands.
|
1367 | 1375 |
|
1368 | 1376 |
|
1369 | |
2.1.13 FPU instructions
|
|
1377 |
2.1.13 FPU instructions
|
1370 | 1378 |
|
1371 | 1379 |
The FPU (Floating-Point Unit) instructions operate on the floating-point
|
1372 | 1380 |
values in three formats: single precision (32-bit), double precision (64-bit)
|
|
1382 | 1390 |
format.
|
1383 | 1391 |
|
1384 | 1392 |
fld dword [bx] ; load single precision value from memory
|
1385 | |
fld st2 ; push value of st2 onto register stack
|
|
1393 |
fld st2 ; push value of st2 onto register stack
|
1386 | 1394 |
|
1387 | 1395 |
"fld1", "fldz", "fldl2t", "fldl2e", "fldpi", "fldlg2" and "fldln2" load the
|
1388 | 1396 |
commonly used constants onto the FPU register stack. The loaded constants are
|
|
1400 | 1408 |
getting rid of ST0. "fstp" accepts the same operands as the "fst" instruction
|
1401 | 1409 |
and can also store value in the 80-bit memory.
|
1402 | 1410 |
|
1403 | |
fst st3 ; copy value of st0 into st3 register
|
|
1411 |
fst st3 ; copy value of st0 into st3 register
|
1404 | 1412 |
fstp tword [bx] ; store value in memory and pop stack
|
1405 | 1413 |
|
1406 | 1414 |
"fist" converts the value in ST0 to a signed integer and stores the result
|
|
1429 | 1437 |
must be an FPU register and the source operand must be the ST0. When no
|
1430 | 1438 |
operands are specified, ST1 is used as a destination operand.
|
1431 | 1439 |
|
1432 | |
faddp ; add st0 to st1 and pop the stack
|
|
1440 |
faddp ; add st0 to st1 and pop the stack
|
1433 | 1441 |
faddp st2,st0 ; add st0 to st2 and pop the stack
|
1434 | 1442 |
|
1435 | 1443 |
"fiadd" instruction converts an integer source operand into double extended
|
|
1440 | 1448 |
|
1441 | 1449 |
"fsub", "fsubr", "fmul", "fdiv", "fdivr" instructions are similar to "fadd",
|
1442 | 1450 |
have the same rules for operands and differ only in the performed computation.
|
1443 | |
"fsub" substracts the source operand from the destination operand, "fsubr"
|
1444 | |
substract the destination operand from the source operand, "fmul" multiplies
|
|
1451 |
"fsub" subtracts the source operand from the destination operand, "fsubr"
|
|
1452 |
subtracts the destination operand from the source operand, "fmul" multiplies
|
1445 | 1453 |
the destination and source operands, "fdiv" divides the destination operand by
|
1446 | 1454 |
the source operand and "fdivr" divides the source operand by the destination
|
1447 | 1455 |
operand. "fsubp", "fsubrp", "fmulp", "fdivp", "fdivrp" perform the same
|
|
1455 | 1463 |
"fchs" complements its sign bit, "fabs" clears its sign to create the absolute
|
1456 | 1464 |
value, "frndint" rounds it to the nearest integral value, depending on the
|
1457 | 1465 |
current rounding mode. "f2xm1" computes the exponential value of 2 to the
|
1458 | |
power of ST0 and substracts the 1.0 from it, the value of ST0 must lie in the
|
|
1466 |
power of ST0 and subtracts the 1.0 from it, the value of ST0 must lie in the
|
1459 | 1467 |
range -1.0 to +1.0. All these instructions store the result in ST0 and have no
|
1460 | 1468 |
operands.
|
1461 | 1469 |
"fsincos" computes both the sine and the cosine of the value in ST0
|
|
1483 | 1491 |
operand can be a single or double precision value in memory or the FPU
|
1484 | 1492 |
register. When no operand is specified, ST1 is used as a source operand.
|
1485 | 1493 |
|
1486 | |
fcom ; compare st0 with st1
|
1487 | |
fcomp st2 ; compare st0 with st2 and pop stack
|
|
1494 |
fcom ; compare st0 with st1
|
|
1495 |
fcomp st2 ; compare st0 with st2 and pop stack
|
1488 | 1496 |
|
1489 | 1497 |
"fcompp" compares the contents of ST0 and ST1, sets flags in the FPU status
|
1490 | 1498 |
word according to the results and pops the register stack twice. This
|
|
1512 | 1520 |
should be ST0 register and the second operand specifies the source FPU
|
1513 | 1521 |
register.
|
1514 | 1522 |
|
1515 | |
fcomi st2 ; compare st0 with st2 and set flags
|
|
1523 |
fcomi st2 ; compare st0 with st2 and set flags
|
1516 | 1524 |
fcmovb st0,st2 ; transfer st2 to st0 if below
|
1517 | 1525 |
|
1518 | 1526 |
Table 2.2 FPU conditions
|
1519 | 1527 |
/------------------------------------------------------\
|
1520 | |
| Mnemonic | Condition tested | Description |
|
|
1528 |
| Mnemonic | Condition tested | Description |
|
1521 | 1529 |
|==========|==================|========================|
|
1522 | |
| b | CF = 1 | below |
|
1523 | |
| e | ZF = 1 | equal |
|
1524 | |
| be | CF or ZF = 1 | below or equal |
|
1525 | |
| u | PF = 1 | unordered |
|
1526 | |
| nb | CF = 0 | not below |
|
1527 | |
| ne | ZF = 0 | not equal |
|
1528 | |
| nbe | CF and ZF = 0 | not below nor equal |
|
1529 | |
| nu | PF = 0 | not unordered |
|
|
1530 |
| b | CF = 1 | below |
|
|
1531 |
| e | ZF = 1 | equal |
|
|
1532 |
| be | CF or ZF = 1 | below or equal |
|
|
1533 |
| u | PF = 1 | unordered |
|
|
1534 |
| nb | CF = 0 | not below |
|
|
1535 |
| ne | ZF = 0 | not equal |
|
|
1536 |
| nbe | CF and ZF = 0 | not below nor equal |
|
|
1537 |
| nu | PF = 0 | not unordered |
|
1530 | 1538 |
\------------------------------------------------------/
|
1531 | 1539 |
|
1532 | 1540 |
"ftst" compares the value in ST0 with 0.0 and sets the flags in the FPU
|
|
1568 | 1576 |
"ffree" sets the tag associated with specified FPU register to empty. The
|
1569 | 1577 |
operand should be an FPU register.
|
1570 | 1578 |
"fincstp" and "fdecstp" rotate the FPU stack by one by adding or
|
1571 | |
substracting one to the pointer of the top of stack. These instructions have no
|
|
1579 |
subtracting one to the pointer of the top of stack. These instructions have no
|
1572 | 1580 |
operands.
|
1573 | 1581 |
|
1574 | 1582 |
|
1575 | |
2.1.14 MMX instructions
|
|
1583 |
2.1.14 MMX instructions
|
1576 | 1584 |
|
1577 | 1585 |
The MMX instructions operate on the packed integer types and use the MMX
|
1578 | 1586 |
registers, which are the low 64-bit parts of the 80-bit FPU registers. Because
|
|
1597 | 1605 |
source and destination operand and stored in the data elements of the
|
1598 | 1606 |
destination operand. "paddb", "paddw" and "paddd" perform the addition of
|
1599 | 1607 |
packed bytes, packed words, or packed double words. "psubb", "psubw" and
|
1600 | |
"psubd" perform the substraction of appropriate types. "paddsb", "paddsw",
|
1601 | |
"psubsb" and "psubsw" perform the addition or substraction of packed bytes
|
|
1608 |
"psubd" perform the subtraction of appropriate types. "paddsb", "paddsw",
|
|
1609 |
"psubsb" and "psubsw" perform the addition or subtraction of packed bytes
|
1602 | 1610 |
or packed words with the signed saturation. "paddusb", "paddusw", "psubusb",
|
1603 | 1611 |
"psubusw" are analogous, but with unsigned saturation. "pmulhw" and "pmullw"
|
1604 | 1612 |
perform a signed multiplication of the packed words and store the high or low
|
|
1644 | 1652 |
used before using the FPU instructions if any MMX instructions were used.
|
1645 | 1653 |
|
1646 | 1654 |
|
1647 | |
2.1.15 SSE instructions
|
|
1655 |
2.1.15 SSE instructions
|
1648 | 1656 |
|
1649 | 1657 |
The SSE extension adds more MMX instructions and also introduces the
|
1650 | 1658 |
operations on packed single precision floating point values. The 128-bit
|
|
1695 | 1703 |
must be a SSE register and the operation is performed on single precision
|
1696 | 1704 |
values, only low double words of SSE registers are used in this case, the
|
1697 | 1705 |
result is stored in the low double word of destination register. "addps" and
|
1698 | |
"addss" add the values, "subps" and "subss" substract the source value from
|
|
1706 |
"addss" add the values, "subps" and "subss" subtract the source value from
|
1699 | 1707 |
destination value, "mulps" and "mulss" multiply the values, "divps" and
|
1700 | 1708 |
"divss" divide the destination value by the source value, "rcpps" and "rcpss"
|
1701 | 1709 |
compute the approximate reciprocal of the source value, "sqrtps" and "sqrtss"
|
|
1728 | 1736 |
|
1729 | 1737 |
Table 2.3 SSE conditions
|
1730 | 1738 |
/-------------------------------------------\
|
1731 | |
| Code | Mnemonic | Description |
|
|
1739 |
| Code | Mnemonic | Description |
|
1732 | 1740 |
|======|==========|=========================|
|
1733 | |
| 0 | eq | equal |
|
1734 | |
| 1 | lt | less than |
|
1735 | |
| 2 | le | less than or equal |
|
1736 | |
| 3 | unord | unordered |
|
1737 | |
| 4 | neq | not equal |
|
1738 | |
| 5 | nlt | not less than |
|
1739 | |
| 6 | nle | not less than nor equal |
|
1740 | |
| 7 | ord | ordered |
|
|
1741 |
| 0 | eq | equal |
|
|
1742 |
| 1 | lt | less than |
|
|
1743 |
| 2 | le | less than or equal |
|
|
1744 |
| 3 | unord | unordered |
|
|
1745 |
| 4 | neq | not equal |
|
|
1746 |
| 5 | nlt | not less than |
|
|
1747 |
| 6 | nle | not less than nor equal |
|
|
1748 |
| 7 | ord | ordered |
|
1741 | 1749 |
\-------------------------------------------/
|
1742 | 1750 |
|
1743 | 1751 |
"comiss" and "ucomiss" compare the single precision values and set the ZF,
|
|
1859 | 1867 |
of no specified size.
|
1860 | 1868 |
|
1861 | 1869 |
|
1862 | |
2.1.16 SSE2 instructions
|
|
1870 |
2.1.16 SSE2 instructions
|
1863 | 1871 |
|
1864 | 1872 |
The SSE2 extension introduces the operations on packed double precision
|
1865 | 1873 |
floating point values, extends the syntax of MMX instructions, and adds also
|
|
1980 | 1988 |
is introduced, which performs the same operation as "pshufw", but on the
|
1981 | 1989 |
double words instead of words, it allows only the extended syntax.
|
1982 | 1990 |
|
1983 | |
psubb xmm0,[esi] ; substract 16 packed bytes
|
|
1991 |
psubb xmm0,[esi] ; subtract 16 packed bytes
|
1984 | 1992 |
pextrw eax,xmm0,7 ; extract highest word into eax
|
1985 | 1993 |
|
1986 | 1994 |
"paddq" performs the addition of packed quad words, "psubq" performs the
|
1987 | |
substraction of packed quad words, "pmuludq" performs an unsigned
|
|
1995 |
subtraction of packed quad words, "pmuludq" performs an unsigned
|
1988 | 1996 |
multiplication of low double words from each corresponding quad words and
|
1989 | 1997 |
returns the results in packed quad words. These instructions follow the same
|
1990 | 1998 |
rules for operands as the general MMX operations described in 2.1.14.
|
|
2020 | 2028 |
"lfence" instructions. These instructions have no operands.
|
2021 | 2029 |
|
2022 | 2030 |
|
2023 | |
2.1.17 SSE3 instructions
|
|
2031 |
2.1.17 SSE3 instructions
|
2024 | 2032 |
|
2025 | 2033 |
Prescott technology introduced some new instructions to improve the performance
|
2026 | 2034 |
of SSE and SSE2 - this extension is called SSE3.
|
|
2041 | 2049 |
cacheline boundary. The destination operand has to be SSE register, the source
|
2042 | 2050 |
operand must be 128-bit memory location.
|
2043 | 2051 |
"addsubps" performs single precision addition of second and fourth pairs and
|
2044 | |
single precision substracion of the first and third pairs of floating point
|
|
2052 |
single precision subtraction of the first and third pairs of floating point
|
2045 | 2053 |
values in the operands. "addsubpd" performs double precision addition of the
|
2046 | |
second pair and double precision substraction of the first pair of floating
|
|
2054 |
second pair and double precision subtraction of the first pair of floating
|
2047 | 2055 |
point values in the operand. "haddps" performs the addition of two single
|
2048 | 2056 |
precision values within each quad word of source and destination operands,
|
2049 | 2057 |
and stores the results of such horizontal addition of values from destination
|
|
2068 | 2076 |
destination register). They operate on 16-bit or 32-bit chunks, respectively.
|
2069 | 2077 |
"phaddsw" performs the same operation on signed 16-bit packed values, but the
|
2070 | 2078 |
result of each addition is saturated. "phsubw" and "phsubd" analogously
|
2071 | |
perform the horizontal substraction of 16-bit or 32-bit packed value, and
|
2072 | |
"phsubsw" performs the horizontal substraction of signed 16-bit packed values
|
|
2079 |
perform the horizontal subtraction of 16-bit or 32-bit packed values, and
|
|
2080 |
"phsubsw" performs the horizontal subtraction of signed 16-bit packed values
|
2073 | 2081 |
with saturation.
|
2074 | 2082 |
"pabsb", "pabsw" and "pabsd" calculate the absolute value of each signed
|
2075 | 2083 |
packed value in source operand and store them into the destination
|
|
2099 | 2107 |
is the only SSSE3 instruction that takes three arguments.
|
2100 | 2108 |
|
2101 | 2109 |
|
2102 | |
2.1.18 AMD 3DNow! instructions
|
|
2110 |
2.1.18 AMD 3DNow! instructions
|
2103 | 2111 |
|
2104 | 2112 |
The 3DNow! extension adds new MMX instructions to those described in 2.1.14,
|
2105 | 2113 |
and introduces operation on the 64-bit packed floating point values, each
|
|
2116 | 2124 |
double word in source operand are used. "pf2iw" converts packed floating
|
2117 | 2125 |
point values to packed word integers, results are extended to double words
|
2118 | 2126 |
using the sign extension. "pfadd" adds packed floating point values. "pfsub"
|
2119 | |
and "pfsubr" substracts packed floating point values, the first one substracts
|
2120 | |
source values from destination values, the second one substracts destination
|
|
2127 |
and "pfsubr" subtract packed floating point values, the first one subtracts
|
|
2128 |
source values from destination values, the second one subtracts destination
|
2121 | 2129 |
values from the source values. "pfmul" multiplies packed floating point
|
2122 | 2130 |
values. "pfacc" adds the low and high floating point values of the destination
|
2123 | 2131 |
operand, storing the result in the low double word of destination, and adds
|
2124 | 2132 |
the low and high floating point values of the source operand, storing the
|
2125 | |
result in the high double word of destination. "pfnacc" substracts the high
|
|
2133 |
result in the high double word of destination. "pfnacc" subtracts the high
|
2126 | 2134 |
floating point value of the destination operand from the low, storing the
|
2127 | |
result in the low double word of destination, and substracts the high floating
|
|
2135 |
result in the low double word of destination, and subtracts the high floating
|
2128 | 2136 |
point value of the source operand from the low, storing the result in the high
|
2129 | |
double word of destination. "pfpnacc" substracts the high floating point value
|
|
2137 |
double word of destination. "pfpnacc" subtracts the high floating point value
|
2130 | 2138 |
of the destination operand from the low, storing the result in the low double
|
2131 | 2139 |
word of destination, and adds the low and high floating point values of the
|
2132 | 2140 |
source operand, storing the result in the high double word of destination.
|
|
2156 | 2164 |
operands.
|
2157 | 2165 |
|
2158 | 2166 |
|
2159 | |
2.1.19 The x86-64 long mode instructions
|
|
2167 |
2.1.19 The x86-64 long mode instructions
|
2160 | 2168 |
|
2161 | 2169 |
The AMD64 and EM64T architectures (we will use the common name x86-64 for them
|
2162 | 2170 |
both) extend the x86 instruction set for the 64-bit processing. While legacy
|
|
2174 | 2182 |
|
2175 | 2183 |
Table 2.4 New registers in long mode
|
2176 | 2184 |
/--------------------------------------------------\
|
2177 | |
| Type | General | SSE | AVX |
|
|
2185 |
| Type | General | SSE | AVX |
|
2178 | 2186 |
|------|---------------------------|-------|-------|
|
2179 | |
| Bits | 8 | 16 | 32 | 64 | 128 | 256 |
|
|
2187 |
| Bits | 8 | 16 | 32 | 64 | 128 | 256 |
|
2180 | 2188 |
|======|======|======|======|======|=======|=======|
|
2181 | |
| | | | | rax | | |
|
2182 | |
| | | | | rcx | | |
|
2183 | |
| | | | | rdx | | |
|
2184 | |
| | | | | rbx | | |
|
2185 | |
| | spl | | | rsp | | |
|
2186 | |
| | bpl | | | rbp | | |
|
2187 | |
| | sil | | | rsi | | |
|
2188 | |
| | dil | | | rdi | | |
|
2189 | |
| | r8b | r8w | r8d | r8 | xmm8 | ymm8 |
|
2190 | |
| | r9b | r9w | r9d | r9 | xmm9 | ymm9 |
|
2191 | |
| | r10b | r10w | r10d | r10 | xmm10 | ymm10 |
|
2192 | |
| | r11b | r11w | r11d | r11 | xmm11 | ymm11 |
|
2193 | |
| | r12b | r12w | r12d | r12 | xmm12 | ymm12 |
|
2194 | |
| | r13b | r13w | r13d | r13 | xmm13 | ymm13 |
|
2195 | |
| | r14b | r14w | r14d | r14 | xmm14 | ymm14 |
|
2196 | |
| | r15b | r15w | r15d | r15 | xmm15 | ymm15 |
|
|
2189 |
| | | | | rax | | |
|
|
2190 |
| | | | | rcx | | |
|
|
2191 |
| | | | | rdx | | |
|
|
2192 |
| | | | | rbx | | |
|
|
2193 |
| | spl | | | rsp | | |
|
|
2194 |
| | bpl | | | rbp | | |
|
|
2195 |
| | sil | | | rsi | | |
|
|
2196 |
| | dil | | | rdi | | |
|
|
2197 |
| | r8b | r8w | r8d | r8 | xmm8 | ymm8 |
|
|
2198 |
| | r9b | r9w | r9d | r9 | xmm9 | ymm9 |
|
|
2199 |
| | r10b | r10w | r10d | r10 | xmm10 | ymm10 |
|
|
2200 |
| | r11b | r11w | r11d | r11 | xmm11 | ymm11 |
|
|
2201 |
| | r12b | r12w | r12d | r12 | xmm12 | ymm12 |
|
|
2202 |
| | r13b | r13w | r13d | r13 | xmm13 | ymm13 |
|
|
2203 |
| | r14b | r14w | r14d | r14 | xmm14 | ymm14 |
|
|
2204 |
| | r15b | r15w | r15d | r15 | xmm15 | ymm15 |
|
2197 | 2205 |
\--------------------------------------------------/
|
2198 | 2206 |
|
2199 | 2207 |
In general any instruction from x86 architecture, which allowed 16-bit or
|
|
2203 | 2211 |
registers. Below are the samples of new operations possible in long mode on the
|
2204 | 2212 |
example of "mov" instruction:
|
2205 | 2213 |
|
2206 | |
mov rax,r8 ; transfer 64-bit general register
|
|
2214 |
mov rax,r8 ; transfer 64-bit general register
|
2207 | 2215 |
mov al,[rbx] ; transfer memory addressed by 64-bit register
|
2208 | 2216 |
|
2209 | 2217 |
The long mode uses also the instruction pointer based addresses, you can
|
|
2283 | 2291 |
and "wrmsr" instructions.
|
2284 | 2292 |
|
2285 | 2293 |
|
2286 | |
2.1.20 SSE4 instructions
|
|
2294 |
2.1.20 SSE4 instructions
|
2287 | 2295 |
|
2288 | 2296 |
There are actually three different sets of instructions under the name SSE4.
|
2289 | 2297 |
Intel designed two of them, SSE4.1 and SSE4.2, with latter extending the
|
|
2420 | 2428 |
destination operand, the source can be 64-bit memory or SSE register.
|
2421 | 2429 |
|
2422 | 2430 |
pmovzxbq xmm0,word [si] ; zero-extend bytes to quad words
|
2423 | |
pmovsxwq xmm0,xmm1 ; sign-extend words to quad words
|
|
2431 |
pmovsxwq xmm0,xmm1 ; sign-extend words to quad words
|
2424 | 2432 |
|
2425 | 2433 |
"movntdqa" loads double quad word from the source operand to the destination
|
2426 | 2434 |
using a non-temporal hint. The destination operand should be SSE register,
|
|
2450 | 2458 |
also be a 64-bit general purpose register, and the source operand in such case
|
2451 | 2459 |
can be a byte or quad word register or memory location.
|
2452 | 2460 |
|
2453 | |
crc32 eax,dl ; accumulate CRC32 on byte value
|
|
2461 |
crc32 eax,dl ; accumulate CRC32 on byte value
|
2454 | 2462 |
crc32 eax,word [ebx] ; accumulate CRC32 on word value
|
2455 | 2463 |
crc32 rax,qword [rbx] ; accumulate CRC32 on quad word value
|
2456 | 2464 |
|
|
2460 | 2468 |
the same size as source operand. The 64-bit variant is available only in long
|
2461 | 2469 |
mode.
|
2462 | 2470 |
|
2463 | |
popcnt ecx,eax ; count bits set to 1
|
|
2471 |
popcnt ecx,eax ; count bits set to 1
|
2464 | 2472 |
|
2465 | 2473 |
The SSE4a extension, which also includes the "popcnt" instruction introduced
|
2466 | 2474 |
by SSE4.2, at the same time adds the "lzcnt" instruction, which follows the
|
|
2475 | 2483 |
is no third operand in such case), which should contain position value in bits
|
2476 | 2484 |
8-13 and length of bit string in bits 0-5.
|
2477 | 2485 |
|
2478 | |
extrq xmm0,8,7 ; extract 8 bits from position 7
|
2479 | |
extrq xmm0,xmm5 ; extract bits defined by register
|
|
2486 |
extrq xmm0,8,7 ; extract 8 bits from position 7
|
|
2487 |
extrq xmm0,xmm5 ; extract bits defined by register
|
2480 | 2488 |
|
2481 | 2489 |
"insertq" writes the sequence of bits from the low quad word of the source
|
2482 | 2490 |
operand into specified position in low quad word of the destination operand,
|
|
2488 | 2496 |
string in bits 64-69.
|
2489 | 2497 |
|
2490 | 2498 |
insertq xmm1,xmm0,4,2 ; insert 4 bits at position 2
|
2491 | |
insertq xmm1,xmm0 ; insert bits defined by register
|
|
2499 |
insertq xmm1,xmm0 ; insert bits defined by register
|
2492 | 2500 |
|
2493 | 2501 |
"movntss" and "movntsd" store single or double precision floating point
|
2494 | 2502 |
value from the source SSE register into 32-bit or 64-bit destination memory
|
2495 | 2503 |
location respectively, using non-temporal hint.
|
2496 | 2504 |
|
2497 | 2505 |
|
2498 | |
2.1.21 AVX instructions
|
|
2506 |
2.1.21 AVX instructions
|
2499 | 2507 |
|
2500 | 2508 |
The Advanced Vector Extensions introduce instructions that are new variants
|
2501 | 2509 |
of SSE instructions, with new scheme of encoding that allows extended syntax
|
|
2512 | 2520 |
the remaining bits of first source SSE register are copied into the
|
2513 | 2521 |
destination register.
|
2514 | 2522 |
|
2515 | |
vsubss xmm0,xmm2,xmm3 ; substract two 32-bit floats
|
|
2523 |
vsubss xmm0,xmm2,xmm3 ; subtract two 32-bit floats
|
2516 | 2524 |
vmulsd xmm0,xmm7,qword [esi] ; multiply two 64-bit floats
|
2517 | 2525 |
|
2518 | 2526 |
In case of packed operations, each instruction can also operate on the 256-bit
|
|
2526 | 2534 |
with three operands, however they are only allowed to operate on 128-bit
|
2527 | 2535 |
packed types and thus cannot use the whole AVX registers.
|
2528 | 2536 |
|
2529 | |
vpavgw xmm3,xmm0,xmm2 ; average of 16-bit integers
|
2530 | |
vpslld xmm1,xmm0,1 ; shift double words left
|
|
2537 |
vpavgw xmm3,xmm0,xmm2 ; average of 16-bit integers
|
|
2538 |
vpslld xmm1,xmm0,1 ; shift double words left
|
2531 | 2539 |
|
2532 | 2540 |
If the SSE version of instruction had a syntax with three operands, the third
|
2533 | 2541 |
one being an immediate value, the AVX version of such instruction takes four
|
2534 | 2542 |
operands, with immediate remaining the last one.
|
2535 | 2543 |
|
2536 | 2544 |
vshufpd ymm0,ymm1,ymm2,10010011b ; shuffle 64-bit floats
|
2537 | |
vpalignr xmm0,xmm4,xmm2,3 ; extract byte aligned value
|
|
2545 |
vpalignr xmm0,xmm4,xmm2,3 ; extract byte aligned value
|
2538 | 2546 |
|
2539 | 2547 |
The promotion to new syntax according to the rules described above has been
|
2540 | 2548 |
applied to all the instructions from SSE extensions up to SSE4, with the
|
|
2545 | 2553 |
"vrsqrtps", which can operate on 256-bit data size, but retained the syntax
|
2546 | 2554 |
with only two operands, because they use data from only one source:
|
2547 | 2555 |
|
2548 | |
vsqrtpd ymm1,ymm0 ; put square roots into other register
|
|
2556 |
vsqrtpd ymm1,ymm0 ; put square roots into other register
|
2549 | 2557 |
|
2550 | 2558 |
In a similar way "vroundpd" and "vroundps" retained the syntax with three
|
2551 | |
operands, the last one being immediate value.
|
|
2559 |
operands, the last one being immediate value.
|
2552 | 2560 |
|
2553 | 2561 |
vroundps ymm0,ymm1,0011b ; round toward zero
|
2554 | |
|
|
2562 |
|
2555 | 2563 |
Also some of the operations on packed integers kept their two-operand or
|
2556 | 2564 |
three-operand syntax while being promoted to AVX version. In such case these
|
2557 | 2565 |
instructions follow exactly the same rules for operands as their SSE
|
|
2574 | 2582 |
syntax from SSE without any changes, and also allows a new form with 256-bit
|
2575 | 2583 |
operands in place of 128-bit ones.
|
2576 | 2584 |
|
2577 | |
vmovups [edi],ymm6 ; store unaligned 256-bit data
|
|
2585 |
vmovups [edi],ymm6 ; store unaligned 256-bit data
|
2578 | 2586 |
|
2579 | 2587 |
"vmovddup" has the identical 128-bit syntax as its SSE version, and it also
|
2580 | 2588 |
has a 256-bit version, which stores the duplicates of the lowest quad word
|
|
2600 | 2608 |
either low or high quad word replaced with value from second source (the
|
2601 | 2609 |
memory operand).
|
2602 | 2610 |
|
2603 | |
vmovhps [esi],xmm7 ; store upper half to memory
|
|
2611 |
vmovhps [esi],xmm7 ; store upper half to memory
|
2604 | 2612 |
vmovlps xmm0,xmm7,[ebx] ; low from memory, rest from register
|
2605 | 2613 |
|
2606 | 2614 |
"vmovss" and "vmovsd" have syntax identical to their SSE equivalents as long
|
|
2609 | 2617 |
in destination is then the value copied from first source with lowest data
|
2610 | 2618 |
element replaced with the lowest value from second source.
|
2611 | 2619 |
|
2612 | |
vmovss xmm3,[edi] ; low from memory, rest zeroed
|
|
2620 |
vmovss xmm3,[edi] ; low from memory, rest zeroed
|
2613 | 2621 |
vmovss xmm0,xmm1,xmm2 ; one value from xmm2, three from xmm1
|
2614 | 2622 |
|
2615 | 2623 |
"vcvtss2sd", "vcvtsd2ss", "vcvtsi2ss" and "vcvtsi2sd" use the three-operand
|
|
2626 | 2634 |
128-bit memory as source. Analogously "vcvtpd2dq", "vcvttpd2dq" and
|
2627 | 2635 |
"vcvtpd2ps", in addition to variant with syntax identical to SSE version,
|
2628 | 2636 |
allow a variant with SSE register as destination and AVX register or 256-bit
|
2629 | |
memory as source.
|
|
2637 |
memory as source.
|
2630 | 2638 |
"vinsertps", "vpinsrb", "vpinsrw", "vpinsrd", "vpinsrq" and "vpblendw" use
|
2631 | 2639 |
a syntax with four operands, where destination and first source have to be SSE
|
2632 | 2640 |
registers, and the third and fourth operand follow the same rules as second
|
|
2646 | 2654 |
first source with some data elements replaced, according to mask, by values
|
2647 | 2655 |
from the second source.
|
2648 | 2656 |
|
2649 | |
vblendvps ymm3,ymm1,ymm2,ymm7 ; blend according to mask
|
|
2657 |
vblendvps ymm3,ymm1,ymm2,ymm7 ; blend according to mask
|
2650 | 2658 |
|
2651 | 2659 |
"vptest" allows the same syntax as its SSE version and also has a 256-bit
|
2652 | 2660 |
version, with both operands doubled in size. There are also two new
|
|
2656 | 2664 |
"vptest".
|
2657 | 2665 |
|
2658 | 2666 |
vptest ymm0,yword [ebx] ; test 256-bit values
|
2659 | |
vtestpd xmm0,xmm1 ; test sign bits of 64-bit floats
|
|
2667 |
vtestpd xmm0,xmm1 ; test sign bits of 64-bit floats
|
2660 | 2668 |
|
2661 | 2669 |
"vbroadcastss", "vbroadcastsd" and "vbroadcastf128" are new instructions,
|
2662 | 2670 |
which broadcast the data element defined by source operand into all elements
|
|
2666 | 2674 |
destination. "vbroadcastf128" requires 128-bit memory as source, and AVX
|
2667 | 2675 |
register as destination.
|
2668 | 2676 |
|
2669 | |
vbroadcastss ymm0,dword [eax] ; get eight copies of value
|
|
2677 |
vbroadcastss ymm0,dword [eax] ; get eight copies of value
|
2670 | 2678 |
|
2671 | 2679 |
"vinsertf128" is the new instruction, which takes four operands. The
|
2672 | 2680 |
destination and first source have to be AVX registers, second source can be
|
|
2687 | 2695 |
data (AVX registers). Either destination or second source has to be a memory
|
2688 | 2696 |
location of appropriate size, the two other operands should be registers.
|
2689 | 2697 |
|
2690 | |
vmaskmovps [edi],xmm0,xmm5 ; conditionally store
|
2691 | |
vmaskmovpd ymm5,ymm0,[esi] ; conditionally load
|
|
2698 |
vmaskmovps [edi],xmm0,xmm5 ; conditionally store
|
|
2699 |
vmaskmovpd ymm5,ymm0,[esi] ; conditionally load
|
2692 | 2700 |
|
2693 | 2701 |
"vpermilpd" and "vpermilps" are the new instructions with three operands
|
2694 | 2702 |
that permute the values from first source according to the control fields from
|
|
2713 | 2721 |
instructions. The rules for their operands remain unchanged.
|
2714 | 2722 |
|
2715 | 2723 |
|
2716 | |
2.1.22 AVX2 instructions
|
|
2724 |
2.1.22 AVX2 instructions
|
2717 | 2725 |
|
2718 | 2726 |
The AVX2 extension allows all the AVX instructions operating on packed integers
|
2719 | 2727 |
to use 256-bit data types, and introduces some new instructions as well.
|
|
2722 | 2730 |
rules became analogous to AVX instructions operating on packed floating point
|
2723 | 2731 |
types.
|
2724 | 2732 |
|
2725 | |
vpsubb ymm0,ymm0,[esi] ; substract 32 packed bytes
|
|
2733 |
vpsubb ymm0,ymm0,[esi] ; subtract 32 packed bytes
|
2726 | 2734 |
vpavgw ymm3,ymm0,ymm2 ; average of 16-bit integers
|
2727 | 2735 |
|
2728 | 2736 |
However there are some instructions that have not been equipped with the
|
|
2734 | 2742 |
amount to be SSE register or 128-bit memory location, use the same rules
|
2735 | 2743 |
for the third operand in their 256-bit variant.
|
2736 | 2744 |
|
2737 | |
vpsllw ymm2,ymm2,xmm4 ; shift words left
|
|
2745 |
vpsllw ymm2,ymm2,xmm4 ; shift words left
|
2738 | 2746 |
vpsrad ymm0,ymm3,xword [ebx] ; shift double words right
|
2739 | 2747 |
|
2740 | 2748 |
There are also new packed shift instructions with standard three-operand AVX
|
|
2749 | 2757 |
256-bit variant need memory of that size doubled or SSE register as source and
|
2750 | 2758 |
AVX register as destination.
|
2751 | 2759 |
|
2752 | |
vpmovzxbq ymm0,dword [esi] ; bytes to quad words
|
|
2760 |
vpmovzxbq ymm0,dword [esi] ; bytes to quad words
|
2753 | 2761 |
|
2754 | 2762 |
Also "vmovntdqa" has been upgraded with 256-bit variant, so it allows to
|
2755 | 2763 |
transfer 256-bit value from memory to AVX register, it needs memory address
|
|
2771 | 2779 |
element.
|
2772 | 2780 |
|
2773 | 2781 |
vpbroadcastb ymm0,byte [ebx] ; get 32 identical bytes
|
2774 | |
|
|
2782 |
|
2775 | 2783 |
"vpermd" and "vpermps" are new three-operand instructions, which use each
|
2776 | 2784 |
32-bit element from first source as an index of element in second source which
|
2777 | 2785 |
is copied into destination at position corresponding to element containing
|
|
2781 | 2789 |
indexes from the immediate value specified as third operand to determine which
|
2782 | 2790 |
element from source store at given position in destination. The destination
|
2783 | 2791 |
has to be AVX register, source can be AVX register or 256-bit memory, and the
|
2784 | |
third operand must be 8-bit immediate value.
|
|
2792 |
third operand must be 8-bit immediate value.
|
2785 | 2793 |
The family of new instructions performing "gather" operation have special
|
2786 | 2794 |
syntax, as in their memory operand they use addressing mode that is unique to
|
2787 | 2795 |
them. The base of address can be a 32-bit or 64-bit general purpose register
|
|
2837 | 2845 |
respectively.
|
2838 | 2846 |
|
2839 | 2847 |
|
2840 | |
2.1.23 Auxiliary sets of computational instructions
|
|
2848 |
2.1.23 Auxiliary sets of computational instructions
|
2841 | 2849 |
|
2842 | 2850 |
There is a number of additional instruction set extensions related to
|
2843 | 2851 |
AVX. They introduce new vector instructions (and sometimes also their SSE
|
|
2884 | 2892 |
The mnemonic of FMA instruction is obtained by appending to "vf" prefix: first
|
2885 | 2893 |
either "m" or "nm" to select whether result of multiplication should be taken
|
2886 | 2894 |
as-is or negated, then either "add" or "sub" to select whether third value
|
2887 | |
will be added to the product or substracted from the product, then either
|
|
2895 |
will be added to the product or subtracted from the product, then either
|
2888 | 2896 |
"132", "213" or "231" to select which source operands are multiplied and which
|
2889 | |
one is added or substracted, and finally the type of data on which the
|
|
2897 |
one is added or subtracted, and finally the type of data on which the
|
2890 | 2898 |
instruction operates, either "ps", "pd", "ss" or "sd". As it was with SSE
|
2891 | 2899 |
instructions promoted to AVX, instructions operating on packed floating point
|
2892 | 2900 |
values allow 128-bit or 256-bit syntax, in former all the operands are SSE
|
|
2896 | 2904 |
SSE registers, and the third operand can also be a memory, either 32-bit for
|
2897 | 2905 |
single precision or 64-bit for double precision.
|
2898 | 2906 |
|
2899 | |
vfmsub231ps ymm1,ymm2,ymm3 ; multiply and substract
|
2900 | |
vfnmadd132sd xmm0,xmm5,[ebx] ; multiply, negate and add
|
|
2907 |
vfmsub231ps ymm1,ymm2,ymm3 ; multiply and subtract
|
|
2908 |
vfnmadd132sd xmm0,xmm5,[ebx] ; multiply, negate and add
|
2901 | 2909 |
|
2902 | 2910 |
In addition to the instructions created by the rule described above, there are
|
2903 | 2911 |
families of instructions with mnemonics starting with either "vfmaddsub" or
|
2904 | 2912 |
"vfmsubadd", followed by either "132", "213" or "231" and then either "ps" or
|
2905 | 2913 |
"pd" (the operation must always be on packed values in this case). They add
|
2906 | |
to the result of multiplication or substract from it depending on the position
|
|
2914 |
to the result of multiplication or subtract from it depending on the position
|
2907 | 2915 |
of value in packed data - instructions from the "vfmaddsub" group add when the
|
2908 | |
position is odd and substract when the position is even, instructions from the
|
|
2916 |
position is odd and subtract when the position is even, instructions from the
|
2909 | 2917 |
"vfmsubadd" group add when the position is even and subtract when the
|
2910 | 2918 |
position is odd. The rules for operands are the same as for other FMA
|
2911 | 2919 |
instructions.
|
|
2915 | 2923 |
out, as having separate destination operand makes such selection of operands
|
2916 | 2924 |
superfluous. The multiplication is always performed on values from the first
|
2917 | 2925 |
and second source, and then the value from third source is added or
|
2918 | |
substracted. Either second or third source can be a memory operand, and the
|
|
2926 |
subtracted. Either second or third source can be a memory operand, and the
|
2919 | 2927 |
rules for the sizes of operands are the same as for FMA instructions.
|
2920 | 2928 |
|
2921 | |
vfmaddpd ymm0,ymm1,[esi],ymm2 ; multiply and add
|
2922 | |
vfmsubss xmm0,xmm1,xmm2,[ebx] ; multiply and substract
|
|
2929 |
vfmaddpd ymm0,ymm1,[esi],ymm2 ; multiply and add
|
|
2930 |
vfmsubss xmm0,xmm1,xmm2,[ebx] ; multiply and subtract
|
2923 | 2931 |
|
2924 | 2932 |
The F16C extension consists of two instructions, "vcvtps2ph" and
|
2925 | 2933 |
"vcvtph2ps", which convert floating point values between single precision and
|
|
2942 | 2950 |
on a solitary double precision value and 32-bit for operation on a solitary
|
2943 | 2951 |
single precision value).
|
2944 | 2952 |
|
2945 | |
vfrczps ymm0,[esi] ; load fractional parts
|
|
2953 |
vfrczps ymm0,[esi] ; load fractional parts
|
2946 | 2954 |
|
2947 | 2955 |
"vpcmov" copies bits from either first or second source into destination
|
2948 | 2956 |
depending on the values of corresponding bits in the fourth operand (the
|
|
2970 | 2978 |
of comparison encoded within the instruction name by inserting the comparison
|
2971 | 2979 |
mnemonic after "vpcom".
|
2972 | 2980 |
|
2973 | |
vpcomb xmm0,xmm1,xmm2,4 ; test for equal bytes
|
2974 | |
vpcomgew xmm0,xmm1,[ebx] ; compare signed words
|
|
2981 |
vpcomb xmm0,xmm1,xmm2,4 ; test for equal bytes
|
|
2982 |
vpcomgew xmm0,xmm1,[ebx] ; compare signed words
|
2975 | 2983 |
|
2976 | 2984 |
Table 2.5 XOP comparisons
|
2977 | 2985 |
/-------------------------------------------\
|
2978 | |
| Code | Mnemonic | Description |
|
|
2986 |
| Code | Mnemonic | Description |
|
2979 | 2987 |
|======|==========|=========================|
|
2980 | |
| 0 | lt | less than |
|
2981 | |
| 1 | le | less than or equal |
|
2982 | |
| 2 | gt | greater than |
|
2983 | |
| 3 | ge | greater than or equal |
|
2984 | |
| 4 | eq | equal |
|
2985 | |
| 5 | neq | not equal |
|
2986 | |
| 6 | false | false |
|
2987 | |
| 7 | true | true |
|
|
2988 |
| 0 | lt | less than |
|
|
2989 |
| 1 | le | less than or equal |
|
|
2990 |
| 2 | gt | greater than |
|
|
2991 |
| 3 | ge | greater than or equal |
|
|
2992 |
| 4 | eq | equal |
|
|
2993 |
| 5 | neq | not equal |
|
|
2994 |
| 6 | false | false |
|
|
2995 |
| 7 | true | true |
|
2988 | 2996 |
\-------------------------------------------/
|
2989 | 2997 |
|
2990 | 2998 |
"vpermil2ps" and "vpermil2pd" set the elements in destination register to
|
|
3006 | 3014 |
64-bit results, "vphaddwd" and "vphadduwd" add pairs of words to 32-bit
|
3007 | 3015 |
results, "vphaddwq" and "vphadduwq" sum all words in each four-word block to
|
3008 | 3016 |
64-bit results, "vphadddq" and "vphaddudq" add pairs of double words to 64-bit
|
3009 | |
results. "vphsubbw" substracts in each two-byte block the byte at higher
|
|
3017 |
results. "vphsubbw" subtracts in each two-byte block the byte at higher
|
3010 | 3018 |
position from the one at lower position, and stores the result as a signed
|
3011 | 3019 |
16-bit value at the corresponding position in destination, "vphsubwd"
|
3012 | |
substracts in each two-word block the word at higher position from the one at
|
3013 | |
lower position and makes signed 32-bit results, "vphsubdq" substract in each
|
|
3020 |
subtracts in each two-word block the word at higher position from the one at
|
|
3021 |
lower position and makes signed 32-bit results, "vphsubdq" subtracts in each
|
3014 | 3022 |
block of two double words the one at higher position from the one at lower
|
3015 | 3023 |
position and makes signed 64-bit results. Each of these instructions takes
|
3016 | 3024 |
two operands, the destination being SSE register, and the source being SSE
|
3017 | 3025 |
register or 128-bit memory.
|
3018 | 3026 |
|
3019 | |
vphadduwq xmm0,xmm1 ; sum quadruplets of words
|
|
3027 |
vphadduwq xmm0,xmm1 ; sum quadruplets of words
|
3020 | 3028 |
|
3021 | 3029 |
"vpmacsww" and "vpmacssww" multiply the corresponding signed 16-bit values
|
3022 | 3030 |
from the first and second source and then add the products to the parallel
|
|
3054 | 3062 |
memory (or they can be SSE registers both) and the other operands have to be
|
3055 | 3063 |
SSE registers.
|
3056 | 3064 |
|
3057 | |
vpshld xmm3,xmm1,[ebx] ; shift bytes from xmm1
|
|
3065 |
vpshld xmm3,xmm1,[ebx] ; shift double words from xmm1
|
3058 | 3066 |
|
3059 | 3067 |
"vpshab", "vpshaw", "vpshad" and "vpshaq" arithmetically shift bytes, words,
|
3060 | 3068 |
double words or quad words. These instructions follow the same rules as the
|
|
3063 | 3071 |
shifts, but additionally allow third operand to be immediate value, in which
|
3064 | 3072 |
case the same amount of rotation is specified for all the elements in source.
|
3065 | 3073 |
|
3066 | |
vprotb xmm0,[esi],3 ; rotate bytes to the left
|
|
3074 |
vprotb xmm0,[esi],3 ; rotate bytes to the left
|
3067 | 3075 |
|
3068 | 3076 |
The MOVBE extension introduces just one new instruction, "movbe", which
|
3069 | 3077 |
swaps bytes in value from source before storing it in destination, so can
|
|
3081 | 3089 |
the first source have to be general registers, the second source can be
|
3082 | 3090 |
general register or memory.
|
3083 | 3091 |
|
3084 | |
andn edx,eax,[ebx] ; bit-multiply inverted eax with memory
|
|
3092 |
andn edx,eax,[ebx] ; bit-multiply inverted eax with memory
|
3085 | 3093 |
|
3086 | 3094 |
"bextr" extracts from the first source the sequence of bits using an index
|
3087 | 3095 |
and length specified by bit fields in the second source operand and stores
|
|
3096 | 3104 |
bits in destination to zero. The destination must be a general register,
|
3097 | 3105 |
the source can be general register or memory.
|
3098 | 3106 |
|
3099 | |
blsi rax,r11 ; isolate the lowest set bit
|
|
3107 |
blsi rax,r11 ; isolate the lowest set bit
|
3100 | 3108 |
|
3101 | 3109 |
"blsmsk" sets all the bits in the destination up to the lowest set bit in
|
3102 | 3110 |
the source, including this bit. "blsr" copies all the bits from the source to
|
|
3114 | 3122 |
"pdep" performs the reverse operation - it takes sequence of bits from the
|
3115 | 3123 |
first source and puts them consecutively at the positions where the bits in
|
3116 | 3124 |
second source are set, setting all the other bits in destination to zero.
|
3117 | |
These BMI2 instructions follow the same rules for operands as "andn".
|
|
3125 |
These BMI2 instructions follow the same rules for operands as "andn".
|
3118 | 3126 |
"mulx" is a BMI2 instruction which performs an unsigned multiplication of
|
3119 | 3127 |
value from EDX or RDX register (depending on the size of specified operands)
|
3120 | 3128 |
by the value from third operand, and stores the low half of result in the
|
|
3122 | 3130 |
it without affecting the flags. The third operand can be general register or
|
3123 | 3131 |
memory, and both the destination operands have to be general registers.
|
3124 | 3132 |
|
3125 | |
mulx edx,eax,ecx ; multiply edx by ecx into edx:eax
|
|
3133 |
mulx edx,eax,ecx ; multiply edx by ecx into edx:eax
|
3126 | 3134 |
|
3127 | 3135 |
"shlx", "shrx" and "sarx" are BMI2 instructions, which perform logical or
|
3128 | 3136 |
arithmetical shifts of value from first source by the amount specified by
|
|
3134 | 3142 |
has to be general register, the source operand can be general register or
|
3135 | 3143 |
memory, and the third operand has to be an immediate value.
|
3136 | 3144 |
|
3137 | |
rorx eax,edx,7 ; rotate without affecting flags
|
3138 | |
|
|
3145 |
rorx eax,edx,7 ; rotate without affecting flags
|
|
3146 |
|
3139 | 3147 |
The TBM is an extension designed by AMD to supplement the BMI set. The
|
3140 | 3148 |
"bextr" instruction is extended with a new form, in which second source is
|
3141 | 3149 |
a 32-bit immediate value. "blsic" is a new instruction which performs the
|
|
3150 | 3158 |
"tzmsk" finds the lowest set bit in value from source operand, sets all bits
|
3151 | 3159 |
below it to 1 and all the rest of bits to zero, then writes the result to
|
3152 | 3160 |
destination. "t1mskc" finds the least significant zero bit in the value from
|
3153 | |
source operand, sets the bits below it to zero and all the other bits to 1,
|
|
3161 |
source operand, sets the bits below it to zero and all the other bits to 1,
|
3154 | 3162 |
and writes the result to destination. These instructions have the same rules
|
3155 | 3163 |
for operands as "blsi".
|
3156 | 3164 |
|
3157 | 3165 |
|
3158 | |
2.1.24 AVX-512 instructions
|
3159 | |
|
3160 | |
[This section has not been written yet.]
|
3161 | |
|
3162 | |
|
3163 | |
2.1.25 Other extensions of instruction set
|
|
3166 |
2.1.24 AVX-512 instructions
|
|
3167 |
|
|
3168 |
The AVX-512 introduces 512-bit vector registers, which extend the 256-bit
|
|
3169 |
registers used by AVX and AVX2. It also extends the set of vector registers
|
|
3170 |
from 16 to 32, with the additional registers "zmm16" to "zmm31", their low
|
|
3171 |
256-bit portions "ymm16" to "ymm31" and their low 128-bit portions "xmm16"
|
|
3172 |
to "xmm31". These additional registers can only be accessed in the long mode.
|
|
3173 |
|
|
3174 |
Table 2.6 New registers available in long mode with AVX-512
|
|
3175 |
/------------------------------------------------------------------\
|
|
3176 |
| Size | Registers |
|
|
3177 |
|---------|--------------------------------------------------------|
|
|
3178 |
| 128-bit | xmm16 xmm17 xmm18 xmm19 xmm20 xmm21 xmm22 xmm23 |
|
|
3179 |
| | xmm24 xmm25 xmm26 xmm27 xmm28 xmm29 xmm30 xmm31 |
|
|
3180 |
|---------|--------------------------------------------------------|
|
|
3181 |
| 256-bit | ymm16 ymm17 ymm18 ymm19 ymm20 ymm21 ymm22 ymm23 |
|
|
3182 |
| | ymm24 ymm25 ymm26 ymm27 ymm28 ymm29 ymm30 ymm31 |
|
|
3183 |
|---------|--------------------------------------------------------|
|
|
3184 |
| 512-bit | zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 |
|
|
3185 |
| | zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31 |
|
|
3186 |
\------------------------------------------------------------------/
|
|
3187 |
|
|
3188 |
In addition to new operand sizes and registers, the AVX-512 introduces
|
|
3189 |
a number of supplementary settings that can be included in the operands
|
|
3190 |
of AVX instructions.
|
|
3191 |
The destination operand of most AVX instructions can be followed
|
|
3192 |
by the name of an opmask register enclosed in braces, this modifier
|
|
3193 |
specifies a mask that decides which units of data in the destination
|
|
3194 |
operand are going to be updated. The "k0" register cannot be used as a
|
|
3195 |
destination mask. This setting can be further followed by "{z}" modifier
|
|
3196 |
to choose that the data units not selected by mask should be zeroed
|
|
3197 |
instead of leaving them unchanged.
|
|
3198 |
|
|
3199 |
vaddpd zmm1{k1},zmm5,zword [rsi] ; update selected floats
|
|
3200 |
vaddps ymm6{k1}{z},ymm12,ymm24 ; update selected, zero other ones
|
|
3201 |
|
|
3202 |
When an instruction that operates on packed data has a source operand
|
|
3203 |
loaded from a memory, the memory location may be just a single unit of data
|
|
3204 |
and the source used for the operation is created by broadcasting this
|
|
3205 |
value into all the units within the required size. To specify that such
|
|
3206 |
broadcasting method is used the memory operand should be followed by one
|
|
3207 |
of the "{1to2}", "{1to4}", "{1to8}", "{1to16}", "{1to32}" and "{1to64}"
|
|
3208 |
modifiers, selecting the appropriate multiple of a unit.
|
|
3209 |
|
|
3210 |
vsubps zmm1,zmm2,dword [rsi] {1to16} ; subtract from all floats
|
|
3211 |
|
|
3212 |
When an instruction does not use a memory operand, often an additional
|
|
3213 |
operand may follow the source operands, containing the rounding mode
|
|
3214 |
specifier. When an instruction has variants that operate on different
|
|
3215 |
sizes of data, the rounding mode can be specified only when the
|
|
3216 |
register operands are 512-bit.
|
|
3217 |
|
|
3218 |
vdivps zmm2,zmm3,zmm5,{ru-sae} ; round results up
|
|
3219 |
|
|
3220 |
Table 2.7 AVX-512 rounding modes
|
|
3221 |
/----------------------------------------------------------\
|
|
3222 |
| Operand | Description |
|
|
3223 |
|==========|===============================================|
|
|
3224 |
| {rn-sae} | round to nearest and suppress all exceptions |
|
|
3225 |
| {rd-sae} | round down and suppress all exceptions |
|
|
3226 |
| {ru-sae} | round up and suppress all exceptions |
|
|
3227 |
| {rz-sae} | round toward zero and suppress all exceptions |
|
|
3228 |
\----------------------------------------------------------/
|
|
3229 |
|
|
3230 |
Some of the instructions do not use a rounding mode but still allow
|
|
3231 |
to specify the exception suppression option with "{sae}" modifier in the
|
|
3232 |
additional operand.
|
|
3233 |
|
|
3234 |
vmaxpd zmm0,zmm1,zmm2,{sae} ; suppress all exceptions
|
|
3235 |
|
|
3236 |
The family of "gather" instructions in their AVX-512 variants use a new
|
|
3237 |
syntax with only two operands. The opmask register takes the role which
|
|
3238 |
was played by the third operand in the AVX2 syntax and it is mandatory
|
|
3239 |
in this case.
|
|
3240 |
|
|
3241 |
vgatherdps xmm0{k1},[eax+xmm1] ; gather four floats
|
|
3242 |
vgatherdpd zmm0{k3},[ymm3*8] ; gather eight doubles
|
|
3243 |
|
|
3244 |
The new family of "scatter" instructions performs an operation reverse to
|
|
3245 |
the one of "gather". They also take two operands, the destination is a
|
|
3246 |
memory with vector indexing and opmask modifier, and the source is a vector
|
|
3247 |
register.
|
|
3248 |
|
|
3249 |
vscatterdps [eax+xmm1]{k1},xmm0 ; scatter four floats
|
|
3250 |
vscatterdpd [ymm3*8]{k3},zmm0 ; scatter eight doubles
|
|
3251 |
|
|
3252 |
|
|
3253 |
2.1.25 Other extensions of instruction set
|
3164 | 3254 |
|
3165 | 3255 |
There is a number of additional instruction set extensions recognized by flat
|
3166 | 3256 |
assembler, and the general syntax of the instructions introduced by those
|
3167 | 3257 |
extensions is provided here. For detailed information on the operations
|
3168 | 3258 |
performed by them, check out the manuals from Intel (for the VMX, SMX, XSAVE,
|
3169 | |
RDRAND, FSGSBASE, INVPCID, HLE and RTM extensions) or AMD (for the SVM
|
|
3259 |
RDRAND, FSGSBASE, INVPCID, HLE, RTM, and MPX extensions) or AMD (for the SVM
|
3170 | 3260 |
extension).
|
3171 | 3261 |
The Virtual-Machine Extensions (VMX) provide a set of instructions for the
|
3172 | 3262 |
management of virtual machines. The "vmxon" instruction, which enters the VMX
|
|
3257 | 3347 |
an 8-bit immediate value as its only operand, this value is passed in the
|
3258 | 3348 |
highest bits of EAX to the fallback routine. "xtest" checks whether there is
|
3259 | 3349 |
transactional execution in progress, this instruction takes no operands.
|
|
3350 |
The MPX extension adds instructions that operate on new bounds registers
|
|
3351 |
and aid in checking the memory references. For some of these instructions
|
|
3352 |
flat assembler allows a special syntax that gives fine control over their
|
|
3353 |
operation, where an address of a memory operand is separated into two parts
|
|
3354 |
with a comma. With "bndmk" instruction the first part of such address specifies
|
|
3355 |
the lower bound and the second one the upper bound. The lower bound can be
|
|
3356 |
either zero or a register, the upper bound can be any address that uses no more
|
|
3357 |
than one register (multiplied by 1, 2, 4, or 8). The addressing registers need
|
|
3358 |
to be 64-bit when in long mode, and 32-bit otherwise.
|
|
3359 |
|
|
3360 |
bndmk bnd0,[rbx,100000h] ; lower bound in register, upper directly
|
|
3361 |
bndmk bnd1,[0,rbx] ; lower bound zero, upper in register
|
|
3362 |
|
|
3363 |
In case of "bndldx" and "bndstx", the first part of memory operand specifies an
|
|
3364 |
address used to access a bound table entry, while the second part is either zero
|
|
3365 |
or a register that plays a role of an additional operand for such instruction.
|
|
3366 |
The address in the first part may use no more than one register and the register
|
|
3367 |
cannot be multiplied by a number other than 1.
|
|
3368 |
|
|
3369 |
bndstx [rcx,rsi],bnd3 ; store bnd3 and rsi at rcx in the bound table
|
|
3370 |
bndldx bnd2,[rcx,rsi] ; load from bound table if entry matches rsi
|
3260 | 3371 |
|
3261 | 3372 |
|
3262 | 3373 |
2.2 Control directives
|
|
3357 | 3468 |
defined somewhere in source:
|
3358 | 3469 |
|
3359 | 3470 |
if count>0
|
3360 | |
mov cx,count
|
3361 | |
rep movsb
|
|
3471 |
mov cx,count
|
|
3472 |
rep movsb
|
3362 | 3473 |
end if
|
3363 | 3474 |
|
3364 | 3475 |
These two assembly instructions will be assembled only if the "count" constant
|
3365 | 3476 |
is greater than 0. The next sample shows more complex conditional structure:
|
3366 | 3477 |
|
3367 | 3478 |
if count & ~ count mod 4
|
3368 | |
mov cx,count/4
|
3369 | |
rep movsd
|
|
3479 |
mov cx,count/4
|
|
3480 |
rep movsd
|
3370 | 3481 |
else if count>4
|
3371 | |
mov cx,count/4
|
3372 | |
rep movsd
|
3373 | |
mov cx,count mod 4
|
3374 | |
rep movsb
|
|
3482 |
mov cx,count/4
|
|
3483 |
rep movsd
|
|
3484 |
mov cx,count mod 4
|
|
3485 |
rep movsb
|
3375 | 3486 |
else
|
3376 | |
mov cx,count
|
3377 | |
rep movsb
|
|
3487 |
mov cx,count
|
|
3488 |
rep movsb
|
3378 | 3489 |
end if
|
3379 | 3490 |
|
3380 | 3491 |
The first block of instructions gets assembled when the "count" is non zero and
|
|
3422 | 3533 |
for example:
|
3423 | 3534 |
|
3424 | 3535 |
repeat 8
|
3425 | |
mov byte [bx],%
|
3426 | |
inc bx
|
|
3536 |
mov byte [bx],%
|
|
3537 |
inc bx
|
3427 | 3538 |
end repeat
|
3428 | 3539 |
|
3429 | 3540 |
The generated code will store byte values from one to eight in the memory
|
|
3436 | 3547 |
|
3437 | 3548 |
s = x/2
|
3438 | 3549 |
repeat 100
|
3439 | |
if x/s = s
|
3440 | |
break
|
3441 | |
end if
|
3442 | |
s = (s+x/s)/2
|
|
3550 |
if x/s = s
|
|
3551 |
break
|
|
3552 |
end if
|
|
3553 |
s = (s+x/s)/2
|
3443 | 3554 |
end repeat
|
3444 | 3555 |
|
3445 | 3556 |
The "while" directive repeats the block of instructions as long as the
|
|
3454 | 3565 |
|
3455 | 3566 |
s = x/2
|
3456 | 3567 |
while x/s <> s
|
3457 | |
s = (s+x/s)/2
|
3458 | |
if % = 100
|
3459 | |
break
|
3460 | |
end if
|
|
3568 |
s = (s+x/s)/2
|
|
3569 |
if % = 100
|
|
3570 |
break
|
|
3571 |
end if
|
3461 | 3572 |
end while
|
3462 | 3573 |
|
3463 | 3574 |
The blocks defined with "if", "repeat" and "while" can be nested in any
|
|
3503 | 3614 |
generated in current addressing space you can use such block of directives:
|
3504 | 3615 |
|
3505 | 3616 |
repeat $-$$
|
3506 | |
load a byte from $$+%-1
|
3507 | |
store byte a xor c at $$+%-1
|
|
3617 |
load a byte from $$+%-1
|
|
3618 |
store byte a xor c at $$+%-1
|
3508 | 3619 |
end repeat
|
3509 | 3620 |
|
3510 | 3621 |
and each byte of code will be xored with the value defined by "c" constant.
|
|
3521 | 3632 |
|
3522 | 3633 |
GDTR dp ?
|
3523 | 3634 |
virtual at GDTR
|
3524 | |
GDT_limit dw ?
|
3525 | |
GDT_address dd ?
|
|
3635 |
GDT_limit dw ?
|
|
3636 |
GDT_address dd ?
|
3526 | 3637 |
end virtual
|
3527 | 3638 |
|
3528 | 3639 |
It defines two labels for parts of the 48-bit variable at "GDTR" address.
|
|
3530 | 3641 |
register, for example:
|
3531 | 3642 |
|
3532 | 3643 |
virtual at bx
|
3533 | |
LDT_limit dw ?
|
3534 | |
LDT_address dd ?
|
|
3644 |
LDT_limit dw ?
|
|
3645 |
LDT_address dd ?
|
3535 | 3646 |
end virtual
|
3536 | 3647 |
|
3537 | 3648 |
With such definition instruction "mov ax,[LDT_limit]" will be assembled
|
|
3544 | 3655 |
example:
|
3545 | 3656 |
|
3546 | 3657 |
virtual at 0
|
3547 | |
xor eax,eax
|
3548 | |
and edx,eax
|
3549 | |
load zeroq dword from 0
|
|
3658 |
xor eax,eax
|
|
3659 |
and edx,eax
|
|
3660 |
load zeroq dword from 0
|
3550 | 3661 |
end virtual
|
3551 | 3662 |
|
3552 | 3663 |
The above piece of code will define the "zeroq" constant containing four bytes
|
|
3555 | 3666 |
For example this code:
|
3556 | 3667 |
|
3557 | 3668 |
virtual at 0
|
3558 | |
file 'a.txt':10h,1
|
3559 | |
load char from 0
|
|
3669 |
file 'a.txt':10h,1
|
|
3670 |
load char from 0
|
3560 | 3671 |
end virtual
|
3561 | 3672 |
|
3562 | 3673 |
loads the single byte from offset 10h in file "a.txt" into the "char"
|
|
3576 | 3687 |
has been closed:
|
3577 | 3688 |
|
3578 | 3689 |
virtual at 0
|
3579 | |
hex_digits::
|
3580 | |
db '0123456789ABCDEF'
|
|
3690 |
hex_digits::
|
|
3691 |
db '0123456789ABCDEF'
|
3581 | 3692 |
end virtual
|
3582 | 3693 |
load a byte from hex_digits:10
|
3583 | 3694 |
|
|
3604 | 3715 |
create the alignment yourself, like:
|
3605 | 3716 |
|
3606 | 3717 |
virtual
|
3607 | |
align 16
|
3608 | |
a = $ - $$
|
|
3718 |
align 16
|
|
3719 |
a = $ - $$
|
3609 | 3720 |
end virtual
|
3610 | 3721 |
db a dup 0
|
3611 | 3722 |
|
|
3619 | 3730 |
bits = 16
|
3620 | 3731 |
display 'Current offset is 0x'
|
3621 | 3732 |
repeat bits/4
|
3622 | |
d = '0' + $ shr (bits-%*4) and 0Fh
|
3623 | |
if d > '9'
|
3624 | |
d = d + 'A'-'9'-1
|
3625 | |
end if
|
3626 | |
display d
|
|
3733 |
d = '0' + $ shr (bits-%*4) and 0Fh
|
|
3734 |
if d > '9'
|
|
3735 |
d = d + 'A'-'9'-1
|
|
3736 |
end if
|
|
3737 |
display d
|
3627 | 3738 |
end repeat
|
3628 | 3739 |
display 13,10
|
3629 | 3740 |
|
|
3673 | 3784 |
Consider the following example:
|
3674 | 3785 |
|
3675 | 3786 |
if ~ defined alpha
|
3676 | |
alpha:
|
|
3787 |
alpha:
|
3677 | 3788 |
end if
|
3678 | 3789 |
|
3679 | 3790 |
The "defined" operator gives the true value when the expression following it
|
|
3695 | 3806 |
condition may make it possible to get it resolved:
|
3696 | 3807 |
|
3697 | 3808 |
if ~ defined alpha | defined @f
|
3698 | |
alpha:
|
3699 | |
@@:
|
|
3809 |
alpha:
|
|
3810 |
@@:
|
3700 | 3811 |
end if
|
3701 | 3812 |
|
3702 | 3813 |
The "@f" is always the same label as the nearest "@@" symbol in the source
|
|
3708 | 3819 |
look at the blocks that have nothing more than this self-establishing:
|
3709 | 3820 |
|
3710 | 3821 |
if defined @f
|
3711 | |
@@:
|
|
3822 |
@@:
|
3712 | 3823 |
end if
|
3713 | 3824 |
|
3714 | 3825 |
This is an example of source that may have more than one solution, as both
|
|
3881 | 3992 |
|
3882 | 3993 |
macro stos0
|
3883 | 3994 |
{
|
3884 | |
xor al,al
|
3885 | |
stosb
|
|
3995 |
xor al,al
|
|
3996 |
stosb
|
3886 | 3997 |
}
|
3887 | 3998 |
|
3888 | 3999 |
The macroinstruction "stos0" will be replaced with these two assembly
|
|
3908 | 4019 |
macro mov op1,op2
|
3909 | 4020 |
{
|
3910 | 4021 |
if op1 in <ds,es,fs,gs,ss> & op2 in <cs,ds,es,fs,gs,ss>
|
3911 | |
push op2
|
3912 | |
pop op1
|
|
4022 |
push op2
|
|
4023 |
pop op1
|
3913 | 4024 |
else
|
3914 | |
mov op1,op2
|
|
4025 |
mov op1,op2
|
3915 | 4026 |
end if
|
3916 | 4027 |
}
|
3917 | 4028 |
|
|
3924 | 4035 |
macro mov op1,op2,op3
|
3925 | 4036 |
{
|
3926 | 4037 |
if op3 eq
|
3927 | |
mov op1,op2
|
|
4038 |
mov op1,op2
|
3928 | 4039 |
else
|
3929 | |
mov op1,op2
|
3930 | |
mov op2,op3
|
|
4040 |
mov op1,op2
|
|
4041 |
mov op2,op3
|
3931 | 4042 |
end if
|
3932 | 4043 |
}
|
3933 | 4044 |
|
|
3971 | 4082 |
|
3972 | 4083 |
macro stoschar [char]
|
3973 | 4084 |
{
|
3974 | |
mov al,char
|
3975 | |
stosb
|
|
4085 |
mov al,char
|
|
4086 |
stosb
|
3976 | 4087 |
}
|
3977 | 4088 |
|
3978 | 4089 |
This macroinstruction accepts unlimited number of arguments, and each one
|
|
3997 | 4108 |
|
3998 | 4109 |
macro movstr
|
3999 | 4110 |
{
|
4000 | |
local move
|
|
4111 |
local move
|
4001 | 4112 |
move:
|
4002 | |
lodsb
|
4003 | |
stosb
|
4004 | |
test al,al
|
4005 | |
jnz move
|
|
4113 |
lodsb
|
|
4114 |
stosb
|
|
4115 |
test al,al
|
|
4116 |
jnz move
|
4006 | 4117 |
}
|
4007 | 4118 |
|
4008 | 4119 |
Each time this macroinstruction is used, "move" will become other unique name
|
|
4027 | 4138 |
macro strtbl name,[string]
|
4028 | 4139 |
{
|
4029 | 4140 |
common
|
4030 | |
label name dword
|
|
4141 |
label name dword
|
4031 | 4142 |
forward
|
4032 | |
local label
|
4033 | |
dd label
|
|
4143 |
local label
|
|
4144 |
dd label
|
4034 | 4145 |
forward
|
4035 | |
label db string,0
|
|
4146 |
label db string,0
|
4036 | 4147 |
}
|
4037 | 4148 |
|
4038 | 4149 |
First argument given to this macroinstruction will become the label for table
|
|
4080 | 4191 |
|
4081 | 4192 |
macro jif op1,cond,op2,label
|
4082 | 4193 |
{
|
4083 | |
cmp op1,op2
|
4084 | |
j#cond label
|
|
4194 |
cmp op1,op2
|
|
4195 |
j#cond label
|
4085 | 4196 |
}
|
4086 | 4197 |
|
4087 | 4198 |
For example "jif ax,ae,10h,exit" will be assembled as "cmp ax,10h" and
|
|
4097 | 4208 |
|
4098 | 4209 |
macro label name
|
4099 | 4210 |
{
|
4100 | |
label name
|
4101 | |
if ~ used name
|
4102 | |
display `name # " is defined but not used.",13,10
|
4103 | |
end if
|
|
4211 |
label name
|
|
4212 |
if ~ used name
|
|
4213 |
display `name # " is defined but not used.",13,10
|
|
4214 |
end if
|
4104 | 4215 |
}
|
4105 | 4216 |
|
4106 | 4217 |
When label defined with such macro is not used in the source, macro will warn
|
|
4113 | 4224 |
macro message arg
|
4114 | 4225 |
{
|
4115 | 4226 |
if arg eqtype ""
|
4116 | |
local str
|
4117 | |
jmp @f
|
4118 | |
str db arg,0Dh,0Ah,24h
|
4119 | |
@@:
|
4120 | |
mov dx,str
|
|
4227 |
local str
|
|
4228 |
jmp @f
|
|
4229 |
str db arg,0Dh,0Ah,24h
|
|
4230 |
@@:
|
|
4231 |
mov dx,str
|
4121 | 4232 |
else
|
4122 | |
mov dx,arg
|
|
4233 |
mov dx,arg
|
4123 | 4234 |
end if
|
4124 | |
mov ah,9
|
4125 | |
int 21h
|
|
4235 |
mov ah,9
|
|
4236 |
int 21h
|
4126 | 4237 |
}
|
4127 | 4238 |
|
4128 | 4239 |
The above macro is designed for displaying messages in DOS programs. When the
|
|
4147 | 4258 |
{
|
4148 | 4259 |
macro instr op1,op2,op3
|
4149 | 4260 |
\{
|
4150 | |
if op3 eq
|
4151 | |
instr op1,op2
|
4152 | |
else
|
4153 | |
instr op1,op2
|
4154 | |
instr op2,op3
|
4155 | |
end if
|
|
4261 |
if op3 eq
|
|
4262 |
instr op1,op2
|
|
4263 |
else
|
|
4264 |
instr op1,op2
|
|
4265 |
instr op2,op3
|
|
4266 |
end if
|
4156 | 4267 |
\}
|
4157 | 4268 |
}
|
4158 | 4269 |
|
|
4184 | 4295 |
defines an alternative syntax for defining macroinstructions, which looks like:
|
4185 | 4296 |
|
4186 | 4297 |
MACRO stoschar char
|
4187 | |
mov al,char
|
4188 | |
stosb
|
|
4298 |
mov al,char
|
|
4299 |
stosb
|
4189 | 4300 |
ENDM
|
4190 | 4301 |
|
4191 | 4302 |
Note that symbol that has such customized definition must be defined with "fix"
|
|
4226 | 4337 |
|
4227 | 4338 |
struc point x,y
|
4228 | 4339 |
{
|
4229 | |
.x dw x
|
4230 | |
.y dw y
|
|
4340 |
.x dw x
|
|
4341 |
.y dw y
|
4231 | 4342 |
}
|
4232 | 4343 |
|
4233 | 4344 |
For example "my point 7,11" will define structure labeled "my", consisting of
|
|
4242 | 4353 |
struc db [data]
|
4243 | 4354 |
{
|
4244 | 4355 |
common
|
4245 | |
. db data
|
4246 | |
.size = $ - .
|
|
4356 |
. db data
|
|
4357 |
.size = $ - .
|
4247 | 4358 |
}
|
4248 | 4359 |
|
4249 | 4360 |
With such definition "msg db 'Hello!',13,10" will define also "msg.size"
|
|
4277 | 4388 |
|
4278 | 4389 |
rept 3 counter
|
4279 | 4390 |
{
|
4280 | |
byte#counter db counter
|
|
4391 |
byte#counter db counter
|
4281 | 4392 |
}
|
4282 | 4393 |
|
4283 | 4394 |
will generate lines:
|