Commit 091f94a3a9447f10fcc125d62073d8af4a8687a1 - fasm

New upstream version 1.71.60 Tomasz Buchert 7 years ago

7 changed file(s) with 548 addition(s) and 428 deletion(s). Raw diff Collapse all Expand all

fasm less more

Binary diff not shown

+531

-420

fasm.txt less more

0	0
1		,'''
2		,,;,, ,,,, ,,,,, ,,, ,,
3		; ; ; ; ; ;
4		; ,''''; '''', ; ; ;
5		; ',,,,;, ,,,,,' ; ; ;
6
7		flat assembler 1.71
8		Programmer's Manual
	1	,'''
	2	,,;,, ,,,, ,,,,, ,,, ,,
	3	; ; ; ; ; ;
	4	; ,''''; '''', ; ; ;
	5	; ',,,,;, ,,,,,' ; ; ;
	6
	7	flat assembler 1.71
	8	Programmer's Manual
9	9
10	10
11	11	Table of contents

13	13
14	14	Chapter 1 Introduction
15	15
16		1.1 Compiler overview
17		1.1.1 System requirements
18		1.1.2 Executing compiler from command line
19		1.1.3 Compiler messages
20		1.1.4 Output formats
21
22		1.2 Assembly syntax
23		1.2.1 Instruction syntax
24		1.2.2 Data definitions
25		1.2.3 Constants and labels
26		1.2.4 Numerical expressions
27		1.2.5 Jumps and calls
28		1.2.6 Size settings
	16	1.1 Compiler overview
	17	1.1.1 System requirements
	18	1.1.2 Executing compiler from command line
	19	1.1.3 Compiler messages
	20	1.1.4 Output formats
	21
	22	1.2 Assembly syntax
	23	1.2.1 Instruction syntax
	24	1.2.2 Data definitions
	25	1.2.3 Constants and labels
	26	1.2.4 Numerical expressions
	27	1.2.5 Jumps and calls
	28	1.2.6 Size settings
29	29
30	30	Chapter 2 Instruction set
31	31
32		2.1 The x86 architecture instructions
33		2.1.1 Data movement instructions
34		2.1.2 Type conversion instructions
35		2.1.3 Binary arithmetic instructions
36		2.1.4 Decimal arithmetic instructions
37		2.1.5 Logical instructions
38		2.1.6 Control transfer instructions
39		2.1.7 I/O instructions
40		2.1.8 Strings operations
41		2.1.9 Flag control instructions
42		2.1.10 Conditional operations
43		2.1.11 Miscellaneous instructions
44		2.1.12 System instructions
45		2.1.13 FPU instructions
46		2.1.14 MMX instructions
47		2.1.15 SSE instructions
48		2.1.16 SSE2 instructions
49		2.1.17 SSE3 instructions
50		2.1.18 AMD 3DNow! instructions
51		2.1.19 The x86-64 long mode instructions
52		2.1.20 SSE4 instructions
53		2.1.21 AVX instructions
54		2.1.22 AVX2 instructions
55		2.1.23 Auxiliary sets of computational instructions
56		2.1.24 AVX-512 instructions
57		2.1.25 Other extensions of instruction set
58
59		2.2 Control directives
60		2.2.1 Numerical constants
61		2.2.2 Conditional assembly
62		2.2.3 Repeating blocks of instructions
63		2.2.4 Addressing spaces
64		2.2.5 Other directives
65		2.2.6 Multiple passes
66
67		2.3 Preprocessor directives
68		2.3.1 Including source files
69		2.3.2 Symbolic constants
70		2.3.3 Macroinstructions
71		2.3.4 Structures
72		2.3.5 Repeating macroinstructions
73		2.3.6 Conditional preprocessing
74		2.3.7 Order of processing
75
76		2.4 Formatter directives
77		2.4.1 MZ executable
78		2.4.2 Portable Executable
79		2.4.3 Common Object File Format
80		2.4.4 Executable and Linkable Format
	32	2.1 The x86 architecture instructions
	33	2.1.1 Data movement instructions
	34	2.1.2 Type conversion instructions
	35	2.1.3 Binary arithmetic instructions
	36	2.1.4 Decimal arithmetic instructions
	37	2.1.5 Logical instructions
	38	2.1.6 Control transfer instructions
	39	2.1.7 I/O instructions
	40	2.1.8 Strings operations
	41	2.1.9 Flag control instructions
	42	2.1.10 Conditional operations
	43	2.1.11 Miscellaneous instructions
	44	2.1.12 System instructions
	45	2.1.13 FPU instructions
	46	2.1.14 MMX instructions
	47	2.1.15 SSE instructions
	48	2.1.16 SSE2 instructions
	49	2.1.17 SSE3 instructions
	50	2.1.18 AMD 3DNow! instructions
	51	2.1.19 The x86-64 long mode instructions
	52	2.1.20 SSE4 instructions
	53	2.1.21 AVX instructions
	54	2.1.22 AVX2 instructions
	55	2.1.23 Auxiliary sets of computational instructions
	56	2.1.24 AVX-512 instructions
	57	2.1.25 Other extensions of instruction set
	58
	59	2.2 Control directives
	60	2.2.1 Numerical constants
	61	2.2.2 Conditional assembly
	62	2.2.3 Repeating blocks of instructions
	63	2.2.4 Addressing spaces
	64	2.2.5 Other directives
	65	2.2.6 Multiple passes
	66
	67	2.3 Preprocessor directives
	68	2.3.1 Including source files
	69	2.3.2 Symbolic constants
	70	2.3.3 Macroinstructions
	71	2.3.4 Structures
	72	2.3.5 Repeating macroinstructions
	73	2.3.6 Conditional preprocessing
	74	2.3.7 Order of processing
	75
	76	2.4 Formatter directives
	77	2.4.1 MZ executable
	78	2.4.2 Portable Executable
	79	2.4.3 Common Object File Format
	80	2.4.4 Executable and Linkable Format
81	81
82	82
83	83

145	145	destination file.
146	146	The following is an example of the compilation summary:
147	147
148		flat assembler version 1.70 (16384 kilobytes memory)
	148	flat assembler version 1.70 (16384 kilobytes memory)
149	149	38 passes, 5.3 seconds, 77824 bytes.
150	150
151	151	In case of error during the compilation process, the program will display an
152	152	error message. For example, when compiler can't find the input file, it will
153	153	display the following message:
154	154
155		flat assembler version 1.70 (16384 kilobytes memory)
	155	flat assembler version 1.70 (16384 kilobytes memory)
156	156	error: source file not found.
157	157
158	158	If the error is connected with a specific part of source code, the source line
159	159	that caused the error will be also displayed. Also placement of this line in
160	160	the source is given to help you find this error, for example:
161	161
162		flat assembler version 1.70 (16384 kilobytes memory)
	162	flat assembler version 1.70 (16384 kilobytes memory)
163	163	example.asm [3]:
164		mob ax,1
	164	mob ax,1
165	165	error: illegal instruction.
166	166
167	167	It means that in the third line of the "example.asm" file compiler has

169	169	contains a macroinstruction, also the line in macroinstruction definition
170	170	that generated the erroneous instruction is displayed:
171	171
172		flat assembler version 1.70 (16384 kilobytes memory)
	172	flat assembler version 1.70 (16384 kilobytes memory)
173	173	example.asm [6]:
174		stoschar 7
	174	stoschar 7
175	175	example.asm [3] stoschar [1]:
176		mob al,char
	176	mob al,char
177	177	error: illegal instruction.
178	178
179	179	It means that the macroinstruction in the sixth line of the "example.asm" file

258	258	\| xword \| 128 \| 16 \|
259	259	\| qqword \| 256 \| 32 \|
260	260	\| yword \| 256 \| 32 \|
	261	\| dqqword \| 512 \| 64 \|
	262	\| zword \| 512 \| 64 \|
261	263	\-------------------------/
262	264
263	265	Table 1.2 Registers
264	266	/-----------------------------------------------------------------\
265		\| Type \| Bits \| \|
	267	\| Type \| Bits \| \|
266	268	\|=========\|======\|================================================\|
267		\| \| 8 \| al cl dl bl ah ch dh bh \|
268		\| General \| 16 \| ax cx dx bx sp bp si di \|
269		\| \| 32 \| eax ecx edx ebx esp ebp esi edi \|
	269	\| \| 8 \| al cl dl bl ah ch dh bh \|
	270	\| General \| 16 \| ax cx dx bx sp bp si di \|
	271	\| \| 32 \| eax ecx edx ebx esp ebp esi edi \|
270	272	\|---------\|------\|------------------------------------------------\|
271		\| Segment \| 16 \| es cs ss ds fs gs \|
	273	\| Segment \| 16 \| es cs ss ds fs gs \|
272	274	\|---------\|------\|------------------------------------------------\|
273		\| Control \| 32 \| cr0 cr2 cr3 cr4 \|
	275	\| Control \| 32 \| cr0 cr2 cr3 cr4 \|
274	276	\|---------\|------\|------------------------------------------------\|
275		\| Debug \| 32 \| dr0 dr1 dr2 dr3 dr6 dr7 \|
	277	\| Debug \| 32 \| dr0 dr1 dr2 dr3 dr6 dr7 \|
276	278	\|---------\|------\|------------------------------------------------\|
277		\| FPU \| 80 \| st0 st1 st2 st3 st4 st5 st6 st7 \|
	279	\| FPU \| 80 \| st0 st1 st2 st3 st4 st5 st6 st7 \|
278	280	\|---------\|------\|------------------------------------------------\|
279		\| MMX \| 64 \| mm0 mm1 mm2 mm3 mm4 mm5 mm6 mm7 \|
	281	\| MMX \| 64 \| mm0 mm1 mm2 mm3 mm4 mm5 mm6 mm7 \|
280	282	\|---------\|------\|------------------------------------------------\|
281	283	\| SSE \| 128 \| xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 \|
282	284	\|---------\|------\|------------------------------------------------\|
283	285	\| AVX \| 256 \| ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 \|
	286	\|---------\|------\|------------------------------------------------\|
	287	\| AVX-512 \| 512 \| zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 \|
	288	\|---------\|------\|------------------------------------------------\|
	289	\| Opmask \| 64 \| k0 k1 k2 k3 k4 k5 k6 k7 \|
	290	\|---------\|------\|------------------------------------------------\|
	291	\| Bounds \| 128 \| bnd0 bnd1 bnd2 bnd3 \|
284	292	\-----------------------------------------------------------------/
285	293
286	294

331	339	\| Size \| Define \| Reserve \|
332	340	\| (bytes) \| data \| data \|
333	341	\|=========\|========\|=========\|
334		\| 1 \| db \| rb \|
335		\| \| file \| \|
	342	\| 1 \| db \| rb \|
	343	\| \| file \| \|
336	344	\|---------\|--------\|---------\|
337		\| 2 \| dw \| rw \|
338		\| \| du \| \|
	345	\| 2 \| dw \| rw \|
	346	\| \| du \| \|
339	347	\|---------\|--------\|---------\|
340		\| 4 \| dd \| rd \|
	348	\| 4 \| dd \| rd \|
341	349	\|---------\|--------\|---------\|
342		\| 6 \| dp \| rp \|
343		\| \| df \| rf \|
	350	\| 6 \| dp \| rp \|
	351	\| \| df \| rf \|
344	352	\|---------\|--------\|---------\|
345		\| 8 \| dq \| rq \|
	353	\| 8 \| dq \| rq \|
346	354	\|---------\|--------\|---------\|
347		\| 10 \| dt \| rt \|
	355	\| 10 \| dt \| rt \|
348	356	\----------------------------/
349	357
350	358

454	462	/-------------------------\
455	463	\| Priority \| Operators \|
456	464	\|==========\|==============\|
457		\| 0 \| + - \|
	465	\| 0 \| + - \|
458	466	\|----------\|--------------\|
459		\| 1 \| * / \|
	467	\| 1 \| * / \|
460	468	\|----------\|--------------\|
461		\| 2 \| mod \|
	469	\| 2 \| mod \|
462	470	\|----------\|--------------\|
463		\| 3 \| and or xor \|
	471	\| 3 \| and or xor \|
464	472	\|----------\|--------------\|
465		\| 4 \| shl shr \|
	473	\| 4 \| shl shr \|
466	474	\|----------\|--------------\|
467		\| 5 \| not \|
	475	\| 5 \| not \|
468	476	\|----------\|--------------\|
469		\| 6 \| bsf bsr \|
	477	\| 6 \| bsf bsr \|
470	478	\|----------\|--------------\|
471		\| 7 \| rva plt \|
	479	\| 7 \| rva plt \|
472	480	\-------------------------/
473	481
474	482

553	561	operand are the same. Below are the examples for each of the allowed
554	562	combinations:
555	563
556		mov bx,ax ; general register to general register
	564	mov bx,ax ; general register to general register
557	565	mov [char],al ; general register to memory
558	566	mov bl,[char] ; memory to general register
559		mov dl,32 ; immediate value to general register
	567	mov dl,32 ; immediate value to general register
560	568	mov [char],32 ; immediate value to memory
561		mov ax,ds ; segment register to general register
	569	mov ax,ds ; segment register to general register
562	570	mov [bx],ds ; segment register to memory
563		mov ds,ax ; general register to segment register
	571	mov ds,ax ; general register to segment register
564	572	mov ds,[bx] ; memory to segment register
565	573	mov eax,cr0 ; control register to general register
566	574	mov cr3,ebx ; general register to control register

570	578	important. The operands may be two general registers, or general register
571	579	with memory. For example:
572	580
573		xchg ax,bx ; swap two general registers
	581	xchg ax,bx ; swap two general registers
574	582	xchg al,[char] ; swap register with memory
575	583
576	584	"push" decrements the stack frame pointer (ESP register), then transfers

584	592	spaces, not commas), compiler will assemble chain of the "push" instructions
585	593	with these operands. The examples are with single operands:
586	594
587		push ax ; store general register
588		push es ; store segment register
589		pushw [bx] ; store memory
590		push 1000h ; store immediate value
	595	push ax ; store general register
	596	push es ; store segment register
	597	pushw [bx] ; store memory
	598	push 1000h ; store immediate value
591	599
592	600	"pusha" saves the contents of the eight general registers on the stack.
593	601	This instruction has no operands. There are two versions of this instruction,

606	614	follow in the same line, compiler will assemble chain of the "pop"
607	615	instructions with these operands.
608	616
609		pop bx ; restore general register
610		pop ds ; restore segment register
611		popw [si] ; restore memory
	617	pop bx ; restore general register
	618	pop ds ; restore segment register
	619	popw [si] ; restore memory
612	620
613	621	"popa" restores the registers saved on the stack by "pusha" instruction,
614	622	except for the saved value of SP (or ESP), which is ignored. This instruction

634	642	extension. The source operand can be general register or memory, while the
635	643	destination operand must be a general register. For example:
636	644
637		movsx ax,al ; byte register to word register
638		movsx edx,dl ; byte register to double word register
639		movsx eax,ax ; word register to double word register
640		movsx ax,byte [bx] ; byte memory to word register
	645	movsx ax,al ; byte register to word register
	646	movsx edx,dl ; byte register to double word register
	647	movsx eax,ax ; word register to double word register
	648	movsx ax,byte [bx] ; byte memory to word register
641	649	movsx edx,byte [bx] ; byte memory to double word register
642	650	movsx eax,word [bx] ; word memory to double word register
643	651

650	658	register or memory, the source operand can be general register or immediate
651	659	value, it can also be memory if the destination operand is register.
652	660
653		add ax,bx ; add register to register
	661	add ax,bx ; add register to register
654	662	add ax,[si] ; add memory to register
655	663	add [di],al ; add register to memory
656		add al,48 ; add immediate value to register
	664	add al,48 ; add immediate value to register
657	665	add [char],48 ; add immediate value to memory
658	666
659	667	"adc" sums the operands, adds one if CF is set, and replaces the destination

664	672	general register or memory, and the size of the operand can be byte, word or
665	673	double word.
666	674
667		inc ax ; increment register by one
	675	inc ax ; increment register by one
668	676	inc byte [bx] ; increment memory by one
669	677
670	678	"sub" subtracts the source operand from the destination operand and replaces

720	728	because, whether the operands are signed or unsigned, the lower half of the
721	729	product is the same. Below are the examples for all three forms:
722	730
723		imul bl ; accumulator by register
	731	imul bl ; accumulator by register
724	732	imul word [si] ; accumulator by memory
725		imul bx,cx ; register by register
	733	imul bx,cx ; register by register
726	734	imul bx,[si] ; register by memory
727		imul bx,10 ; register by immediate value
	735	imul bx,10 ; register by immediate value
728	736	imul ax,bx,10 ; register by immediate value to register
729	737	imul ax,[si],10 ; memory by immediate value to register
730	738

805	813	1, "btr" resets the selected bit to 0, "btc" changes the bit to its
806	814	complement. The first operand can be word or double word.
807	815
808		bt ax,15 ; test bit in register
	816	bt ax,15 ; test bit in register
809	817	bts word [bx],15 ; test and set bit in memory
810		btr ax,cx ; test and reset bit in register
	818	btr ax,cx ; test and reset bit in register
811	819	btc word [bx],cx ; test and complement bit in memory
812	820
813	821	"bsf" and "bsr" instructions scan a word or double word for first set bit

820	828	order to low order (starting from bit index 15 of a word or index 31 of a
821	829	double word).
822	830
823		bsf ax,bx ; scan register forward
	831	bsf ax,bx ; scan register forward
824	832	bsr ax,[si] ; scan memory reverse
825	833
826	834	"shl" shifts the destination operand left by the number of bits specified

830	838	side of the operand as bits exit from the left side. The last bit that exited
831	839	is stored in CF. "sal" is a synonym for "shl".
832	840
833		shl al,1 ; shift register left by one bit
	841	shl al,1 ; shift register left by one bit
834	842	shl byte [bx],1 ; shift memory left by one bit
835		shl ax,cl ; shift register left by count from cl
	843	shl ax,cl ; shift register left by count from cl
836	844	shl word [bx],cl ; shift memory left by count from cl
837	845
838	846	"shr" and "sar" shift the destination operand right by the number of bits

879	887	bits 16 through 23. This instruction is provided for converting little-endian
880	888	values to big-endian format and vice versa.
881	889
882		bswap edx ; swap bytes in register
	890	bswap edx ; swap bytes in register
883	891
884	892
885	893	2.1.6 Control transfer instructions

903	911	variable, the operand should be general register or memory. See also 1.2.5 for
904	912	some more details.
905	913
906		jmp 100h ; direct near jump
	914	jmp 100h ; direct near jump
907	915	jmp 0FFFFh:0 ; direct far jump
908		jmp ax ; indirect near jump
	916	jmp ax ; indirect near jump
909	917	jmp pword [ebx] ; indirect far jump
910	918
911	919	"call" transfers control to the procedure, saving on the stack the address

942	950
943	951	Table 2.1 Conditions
944	952	/-----------------------------------------------------------\
945		\| Mnemonic \| Condition tested \| Description \|
	953	\| Mnemonic \| Condition tested \| Description \|
946	954	\|==========\|=======================\|========================\|
947		\| o \| OF = 1 \| overflow \|
	955	\| o \| OF = 1 \| overflow \|
948	956	\|----------\|-----------------------\|------------------------\|
949		\| no \| OF = 0 \| not overflow \|
	957	\| no \| OF = 0 \| not overflow \|
950	958	\|----------\|-----------------------\|------------------------\|
951		\| c \| \| carry \|
952		\| b \| CF = 1 \| below \|
953		\| nae \| \| not above nor equal \|
	959	\| c \| \| carry \|
	960	\| b \| CF = 1 \| below \|
	961	\| nae \| \| not above nor equal \|
954	962	\|----------\|-----------------------\|------------------------\|
955		\| nc \| \| not carry \|
956		\| ae \| CF = 0 \| above or equal \|
957		\| nb \| \| not below \|
	963	\| nc \| \| not carry \|
	964	\| ae \| CF = 0 \| above or equal \|
	965	\| nb \| \| not below \|
958	966	\|----------\|-----------------------\|------------------------\|
959		\| e \| ZF = 1 \| equal \|
960		\| z \| \| zero \|
	967	\| e \| ZF = 1 \| equal \|
	968	\| z \| \| zero \|
961	969	\|----------\|-----------------------\|------------------------\|
962		\| ne \| ZF = 0 \| not equal \|
963		\| nz \| \| not zero \|
	970	\| ne \| ZF = 0 \| not equal \|
	971	\| nz \| \| not zero \|
964	972	\|----------\|-----------------------\|------------------------\|
965		\| be \| CF or ZF = 1 \| below or equal \|
966		\| na \| \| not above \|
	973	\| be \| CF or ZF = 1 \| below or equal \|
	974	\| na \| \| not above \|
967	975	\|----------\|-----------------------\|------------------------\|
968		\| a \| CF or ZF = 0 \| above \|
969		\| nbe \| \| not below nor equal \|
	976	\| a \| CF or ZF = 0 \| above \|
	977	\| nbe \| \| not below nor equal \|
970	978	\|----------\|-----------------------\|------------------------\|
971		\| s \| SF = 1 \| sign \|
	979	\| s \| SF = 1 \| sign \|
972	980	\|----------\|-----------------------\|------------------------\|
973		\| ns \| SF = 0 \| not sign \|
	981	\| ns \| SF = 0 \| not sign \|
974	982	\|----------\|-----------------------\|------------------------\|
975		\| p \| PF = 1 \| parity \|
976		\| pe \| \| parity even \|
	983	\| p \| PF = 1 \| parity \|
	984	\| pe \| \| parity even \|
977	985	\|----------\|-----------------------\|------------------------\|
978		\| np \| PF = 0 \| not parity \|
979		\| po \| \| parity odd \|
	986	\| np \| PF = 0 \| not parity \|
	987	\| po \| \| parity odd \|
980	988	\|----------\|-----------------------\|------------------------\|
981		\| l \| SF xor OF = 1 \| less \|
982		\| nge \| \| not greater nor equal \|
	989	\| l \| SF xor OF = 1 \| less \|
	990	\| nge \| \| not greater nor equal \|
983	991	\|----------\|-----------------------\|------------------------\|
984		\| ge \| SF xor OF = 0 \| greater or equal \|
985		\| nl \| \| not less \|
	992	\| ge \| SF xor OF = 0 \| greater or equal \|
	993	\| nl \| \| not less \|
986	994	\|----------\|-----------------------\|------------------------\|
987		\| le \| (SF xor OF) or ZF = 1 \| less or equal \|
988		\| ng \| \| not greater \|
	995	\| le \| (SF xor OF) or ZF = 1 \| less or equal \|
	996	\| ng \| \| not greater \|
989	997	\|----------\|-----------------------\|------------------------\|
990		\| g \| (SF xor OF) or ZF = 0 \| greater \|
991		\| nle \| \| not less nor equal \|
	998	\| g \| (SF xor OF) or ZF = 0 \| greater \|
	999	\| nle \| \| not less nor equal \|
992	1000	\-----------------------------------------------------------/
993	1001
994	1002	The "loop" instructions are conditional jumps that use a value placed in

1037	1045	operand should be AL, AX, or EAX register. The source operand should be an
1038	1046	immediate value in range from 0 to 255, or DX register.
1039	1047
1040		in al,20h ; input byte from port 20h
1041		in ax,dx ; input word from port addressed by dx
	1048	in al,20h ; input byte from port 20h
	1049	in ax,dx ; input word from port addressed by dx
1042	1050
1043	1051	"out" transfers a byte, word, or double word to an output port from AL, AX,
1044	1052	or EAX. The program can specify the number of the port using the same methods

1046	1054	in range from 0 to 255, or DX register. The source operand should be AL, AX,
1047	1055	or EAX register.
1048	1056
1049		out 20h,ax ; output word to port 20h
1050		out dx,al ; output byte to port addressed by dx
	1057	out 20h,ax ; output word to port 20h
	1058	out dx,al ; output byte to port addressed by dx
1051	1059
1052	1060
1053	1061	2.1.8 Strings operations

1077	1085
1078	1086	movs byte [di],[si] ; transfer byte
1079	1087	movs word [es:di],[ss:si] ; transfer word
1080		movsd ; transfer double word
	1088	movsd ; transfer double word
1081	1089
1082	1090	"cmps" subtracts the destination string element from the source string
1083	1091	element and updates the flags AF, SF, PF, CF and OF, but it does not change

1087	1095	second operand should be the destination string element addressed by DI or
1088	1096	EDI.
1089	1097
1090		cmpsb ; compare bytes
	1098	cmpsb ; compare bytes
1091	1099	cmps word [ds:si],[es:di] ; compare words
1092	1100	cmps dword [fs:esi],[edi] ; compare double words
1093	1101

1096	1104	PF, CF and OF. If the values are equal, ZF is set, otherwise it is cleared.
1097	1105	The operand should be the destination string element addressed by DI or EDI.
1098	1106
1099		scas byte [es:di] ; scan byte
1100		scasw ; scan word
	1107	scas byte [es:di] ; scan byte
	1108	scasw ; scan word
1101	1109	scas dword [es:edi] ; scan double word
1102	1110
1103	1111	"stos" places the value of AL, AX, or EAX into the destination string

1106	1114	should be the source string element addressed by SI or ESI with any segment
1107	1115	prefix.
1108	1116
1109		lods byte [ds:si] ; load byte
1110		lods word [cs:si] ; load word
1111		lodsd ; load double word
	1117	lods byte [ds:si] ; load byte
	1118	lods word [cs:si] ; load word
	1119	lodsd ; load double word
1112	1120
1113	1121	"ins" transfers a byte, word, or double word from an input port addressed
1114	1122	by DX register to the destination string element. The destination operand
1115	1123	should be memory addressed by DI or EDI, the source operand should be the DX
1116	1124	register.
1117	1125
1118		insb ; input byte
	1126	insb ; input byte
1119	1127	ins word [es:di],dx ; input word
1120		ins dword [edi],dx ; input double word
	1128	ins dword [edi],dx ; input double word
1121	1129
1122	1130	"outs" transfers the source string element to an output port addressed by
1123	1131	DX register. The destination operand should be the DX register and the source
1124	1132	operand should be memory addressed by SI or ESI with any segment prefix.
1125	1133
1126		outs dx,byte [si] ; output byte
1127		outsw ; output word
	1134	outs dx,byte [si] ; output byte
	1135	outsw ; output word
1128	1136	outs dx,dword [gs:esi] ; output double word
1129	1137
1130	1138	The repeat prefixes "rep", "repe"/"repz", and "repne"/"repnz" specify

1141	1149	the execution when the ZF is zero, "repne" and "repnz" terminate the execution
1142	1150	when the ZF is set.
1143	1151
1144		rep movsd ; transfer multiple double words
1145		repe cmpsb ; compare bytes until not equal
	1152	rep movsd ; transfer multiple double words
	1153	repe cmpsb ; compare bytes until not equal
1146	1154
1147	1155
1148	1156	2.1.9 Flag control instructions

1169	1177	and "popfd" forces restoring from the double word.
1170	1178
1171	1179
1172		2.1.10 Conditional operations
	1180	2.1.10 Conditional operations
1173	1181
1174	1182	The instructions obtained by attaching the condition mnemonic (see table
1175	1183	2.1) to the "set" mnemonic set a byte to one if the condition is true and set
1176	1184	the byte to zero otherwise. The operand should be an 8-bit general register
1177	1185	or the byte in memory.
1178	1186
1179		setne al ; set al if zero flag cleared
	1187	setne al ; set al if zero flag cleared
1180	1188	seto byte [bx] ; set byte if overflow
1181	1189
1182	1190	"salc" instruction sets all the bits of AL register when the carry flag is

1208	1216	cmpxchg8b [bx] ; compare and exchange 8 bytes
1209	1217
1210	1218
1211		2.1.11 Miscellaneous instructions
	1219	2.1.11 Miscellaneous instructions
1212	1220
1213	1221	"nop" instruction occupies one byte but affects nothing but the instruction
1214	1222	pointer. This instruction has no operands and doesn't perform any operation.

1269	1277	enter 2048,0 ; enter and allocate 2048 bytes on stack
1270	1278
1271	1279
1272		2.1.12 System instructions
	1280	2.1.12 System instructions
1273	1281
1274	1282	"lmsw" loads the operand into the machine status word (bits 0 through 15 of
1275	1283	CR0 register), while "smsw" stores the machine status word into the

1277	1285	general register or memory, for "smsw" it can also be 32-bit general
1278	1286	register.
1279	1287
1280		lmsw ax ; load machine status from register
1281		smsw [bx] ; store machine status to memory
	1288	lmsw ax ; load machine status from register
	1289	smsw [bx] ; store machine status to memory
1282	1290
1283	1291	"lgdt" and "lidt" instructions load the values in operand into the global
1284	1292	descriptor table register or the interrupt descriptor table register

1286	1294	table register or the interrupt descriptor table register in the destination
1287	1295	operand. The operand should be 6 bytes in memory.
1288	1296
1289		lgdt [ebx] ; load global descriptor table
	1297	lgdt [ebx] ; load global descriptor table
1290	1298
1291	1299	"lldt" loads the operand into the segment selector field of the local
1292	1300	descriptor table register and "sldt" stores the segment selector from the

1300	1308	The source operand should be a 16-bit general register or memory.
1301	1309
1302	1310	lar ax,[bx] ; load access rights into word
1303		lar eax,dx ; load access rights into double word
	1311	lar eax,dx ; load access rights into double word
1304	1312
1305	1313	"lsl" loads the segment limit from the segment descriptor specified by the
1306	1314	selector in source operand into the destination operand and sets the ZF flag.

1320	1328	destination operand. The destination operand can be a word general register
1321	1329	or memory, the source operand must be a general register.
1322	1330
1323		arpl bx,ax ; adjust RPL of selector in register
	1331	arpl bx,ax ; adjust RPL of selector in register
1324	1332	arpl [bx],ax ; adjust RPL of selector in memory
1325	1333
1326	1334	"clts" clears the TS (task switched) flag in the CR0 register. This

1366	1374	instructions are stored in MSRs. These instructions have no operands.
1367	1375
1368	1376
1369		2.1.13 FPU instructions
	1377	2.1.13 FPU instructions
1370	1378
1371	1379	The FPU (Floating-Point Unit) instructions operate on the floating-point
1372	1380	values in three formats: single precision (32-bit), double precision (64-bit)

1382	1390	format.
1383	1391
1384	1392	fld dword [bx] ; load single precision value from memory
1385		fld st2 ; push value of st2 onto register stack
	1393	fld st2 ; push value of st2 onto register stack
1386	1394
1387	1395	"fld1", "fldz", "fldl2t", "fldl2e", "fldpi", "fldlg2" and "fldln2" load the
1388	1396	commonly used constants onto the FPU register stack. The loaded constants are

1400	1408	getting rid of ST0. "fstp" accepts the same operands as the "fst" instruction
1401	1409	and can also store value in the 80-bit memory.
1402	1410
1403		fst st3 ; copy value of st0 into st3 register
	1411	fst st3 ; copy value of st0 into st3 register
1404	1412	fstp tword [bx] ; store value in memory and pop stack
1405	1413
1406	1414	"fist" converts the value in ST0 to a signed integer and stores the result

1429	1437	must be an FPU register and the source operand must be the ST0. When no
1430	1438	operands are specified, ST1 is used as a destination operand.
1431	1439
1432		faddp ; add st0 to st1 and pop the stack
	1440	faddp ; add st0 to st1 and pop the stack
1433	1441	faddp st2,st0 ; add st0 to st2 and pop the stack
1434	1442
1435	1443	"fiadd" instruction converts an integer source operand into double extended

1440	1448
1441	1449	"fsub", "fsubr", "fmul", "fdiv", "fdivr" instructions are similar to "fadd",
1442	1450	have the same rules for operands and differ only in the performed computation.
1443		"fsub" substracts the source operand from the destination operand, "fsubr"
1444		substract the destination operand from the source operand, "fmul" multiplies
	1451	"fsub" subtracts the source operand from the destination operand, "fsubr"
	1452	subtracts the destination operand from the source operand, "fmul" multiplies
1445	1453	the destination and source operands, "fdiv" divides the destination operand by
1446	1454	the source operand and "fdivr" divides the source operand by the destination
1447	1455	operand. "fsubp", "fsubrp", "fmulp", "fdivp", "fdivrp" perform the same

1455	1463	"fchs" complements its sign bit, "fabs" clears its sign to create the absolute
1456	1464	value, "frndint" rounds it to the nearest integral value, depending on the
1457	1465	current rounding mode. "f2xm1" computes the exponential value of 2 to the
1458		power of ST0 and substracts the 1.0 from it, the value of ST0 must lie in the
	1466	power of ST0 and subtracts the 1.0 from it, the value of ST0 must lie in the
1459	1467	range -1.0 to +1.0. All these instructions store the result in ST0 and have no
1460	1468	operands.
1461	1469	"fsincos" computes both the sine and the cosine of the value in ST0

1483	1491	operand can be a single or double precision value in memory or the FPU
1484	1492	register. When no operand is specified, ST1 is used as a source operand.
1485	1493
1486		fcom ; compare st0 with st1
1487		fcomp st2 ; compare st0 with st2 and pop stack
	1494	fcom ; compare st0 with st1
	1495	fcomp st2 ; compare st0 with st2 and pop stack
1488	1496
1489	1497	"fcompp" compares the contents of ST0 and ST1, sets flags in the FPU status
1490	1498	word according to the results and pops the register stack twice. This

1512	1520	should be ST0 register and the second operand specifies the source FPU
1513	1521	register.
1514	1522
1515		fcomi st2 ; compare st0 with st2 and set flags
	1523	fcomi st2 ; compare st0 with st2 and set flags
1516	1524	fcmovb st0,st2 ; transfer st2 to st0 if below
1517	1525
1518	1526	Table 2.2 FPU conditions
1519	1527	/------------------------------------------------------\
1520		\| Mnemonic \| Condition tested \| Description \|
	1528	\| Mnemonic \| Condition tested \| Description \|
1521	1529	\|==========\|==================\|========================\|
1522		\| b \| CF = 1 \| below \|
1523		\| e \| ZF = 1 \| equal \|
1524		\| be \| CF or ZF = 1 \| below or equal \|
1525		\| u \| PF = 1 \| unordered \|
1526		\| nb \| CF = 0 \| not below \|
1527		\| ne \| ZF = 0 \| not equal \|
1528		\| nbe \| CF and ZF = 0 \| not below nor equal \|
1529		\| nu \| PF = 0 \| not unordered \|
	1530	\| b \| CF = 1 \| below \|
	1531	\| e \| ZF = 1 \| equal \|
	1532	\| be \| CF or ZF = 1 \| below or equal \|
	1533	\| u \| PF = 1 \| unordered \|
	1534	\| nb \| CF = 0 \| not below \|
	1535	\| ne \| ZF = 0 \| not equal \|
	1536	\| nbe \| CF and ZF = 0 \| not below nor equal \|
	1537	\| nu \| PF = 0 \| not unordered \|
1530	1538	\------------------------------------------------------/
1531	1539
1532	1540	"ftst" compares the value in ST0 with 0.0 and sets the flags in the FPU

1568	1576	"ffree" sets the tag associated with specified FPU register to empty. The
1569	1577	operand should be an FPU register.
1570	1578	"fincstp" and "fdecstp" rotate the FPU stack by one by adding or
1571		substracting one to the pointer of the top of stack. These instructions have no
	1579	subtracting one to the pointer of the top of stack. These instructions have no
1572	1580	operands.
1573	1581
1574	1582
1575		2.1.14 MMX instructions
	1583	2.1.14 MMX instructions
1576	1584
1577	1585	The MMX instructions operate on the packed integer types and use the MMX
1578	1586	registers, which are the low 64-bit parts of the 80-bit FPU registers. Because

1597	1605	source and destination operand and stored in the data elements of the
1598	1606	destination operand. "paddb", "paddw" and "paddd" perform the addition of
1599	1607	packed bytes, packed words, or packed double words. "psubb", "psubw" and
1600		"psubd" perform the substraction of appropriate types. "paddsb", "paddsw",
1601		"psubsb" and "psubsw" perform the addition or substraction of packed bytes
	1608	"psubd" perform the subtraction of appropriate types. "paddsb", "paddsw",
	1609	"psubsb" and "psubsw" perform the addition or subtraction of packed bytes
1602	1610	or packed words with the signed saturation. "paddusb", "paddusw", "psubusb",
1603	1611	"psubusw" are analogous, but with unsigned saturation. "pmulhw" and "pmullw"
1604	1612	perform a signed multiplication of the packed words and store the high or low

1644	1652	used before using the FPU instructions if any MMX instructions were used.
1645	1653
1646	1654
1647		2.1.15 SSE instructions
	1655	2.1.15 SSE instructions
1648	1656
1649	1657	The SSE extension adds more MMX instructions and also introduces the
1650	1658	operations on packed single precision floating point values. The 128-bit

1695	1703	must be a SSE register and the operation is performed on single precision
1696	1704	values, only low double words of SSE registers are used in this case, the
1697	1705	result is stored in the low double word of destination register. "addps" and
1698		"addss" add the values, "subps" and "subss" substract the source value from
	1706	"addss" add the values, "subps" and "subss" subtract the source value from
1699	1707	destination value, "mulps" and "mulss" multiply the values, "divps" and
1700	1708	"divss" divide the destination value by the source value, "rcpps" and "rcpss"
1701	1709	compute the approximate reciprocal of the source value, "sqrtps" and "sqrtss"

1728	1736
1729	1737	Table 2.3 SSE conditions
1730	1738	/-------------------------------------------\
1731		\| Code \| Mnemonic \| Description \|
	1739	\| Code \| Mnemonic \| Description \|
1732	1740	\|======\|==========\|=========================\|
1733		\| 0 \| eq \| equal \|
1734		\| 1 \| lt \| less than \|
1735		\| 2 \| le \| less than or equal \|
1736		\| 3 \| unord \| unordered \|
1737		\| 4 \| neq \| not equal \|
1738		\| 5 \| nlt \| not less than \|
1739		\| 6 \| nle \| not less than nor equal \|
1740		\| 7 \| ord \| ordered \|
	1741	\| 0 \| eq \| equal \|
	1742	\| 1 \| lt \| less than \|
	1743	\| 2 \| le \| less than or equal \|
	1744	\| 3 \| unord \| unordered \|
	1745	\| 4 \| neq \| not equal \|
	1746	\| 5 \| nlt \| not less than \|
	1747	\| 6 \| nle \| not less than nor equal \|
	1748	\| 7 \| ord \| ordered \|
1741	1749	\-------------------------------------------/
1742	1750
1743	1751	"comiss" and "ucomiss" compare the single precision values and set the ZF,

1859	1867	of no specified size.
1860	1868
1861	1869
1862		2.1.16 SSE2 instructions
	1870	2.1.16 SSE2 instructions
1863	1871
1864	1872	The SSE2 extension introduces the operations on packed double precision
1865	1873	floating point values, extends the syntax of MMX instructions, and adds also

1980	1988	is introduced, which performs the same operation as "pshufw", but on the
1981	1989	double words instead of words, it allows only the extended syntax.
1982	1990
1983		psubb xmm0,[esi] ; substract 16 packed bytes
	1991	psubb xmm0,[esi] ; subtract 16 packed bytes
1984	1992	pextrw eax,xmm0,7 ; extract highest word into eax
1985	1993
1986	1994	"paddq" performs the addition of packed quad words, "psubq" performs the
1987		substraction of packed quad words, "pmuludq" performs an unsigned
	1995	subtraction of packed quad words, "pmuludq" performs an unsigned
1988	1996	multiplication of low double words from each corresponding quad words and
1989	1997	returns the results in packed quad words. These instructions follow the same
1990	1998	rules for operands as the general MMX operations described in 2.1.14.

2020	2028	"lfence" instructions. These instructions have no operands.
2021	2029
2022	2030
2023		2.1.17 SSE3 instructions
	2031	2.1.17 SSE3 instructions
2024	2032
2025	2033	Prescott technology introduced some new instructions to improve the performance
2026	2034	of SSE and SSE2 - this extension is called SSE3.

2041	2049	cacheline boundary. The destination operand has to be SSE register, the source
2042	2050	operand must be 128-bit memory location.
2043	2051	"addsubps" performs single precision addition of second and fourth pairs and
2044		single precision substracion of the first and third pairs of floating point
	2052	single precision subtraction of the first and third pairs of floating point
2045	2053	values in the operands. "addsubpd" performs double precision addition of the
2046		second pair and double precision substraction of the first pair of floating
	2054	second pair and double precision subtraction of the first pair of floating
2047	2055	point values in the operand. "haddps" performs the addition of two single
2048	2056	precision values within each quad word of source and destination operands,
2049	2057	and stores the results of such horizontal addition of values from destination

2068	2076	destination register). They operate on 16-bit or 32-bit chunks, respectively.
2069	2077	"phaddsw" performs the same operation on signed 16-bit packed values, but the
2070	2078	result of each addition is saturated. "phsubw" and "phsubd" analogously
2071		perform the horizontal substraction of 16-bit or 32-bit packed value, and
2072		"phsubsw" performs the horizontal substraction of signed 16-bit packed values
	2079	perform the horizontal subtraction of 16-bit or 32-bit packed value, and
	2080	"phsubsw" performs the horizontal subtraction of signed 16-bit packed values
2073	2081	with saturation.
2074	2082	"pabsb", "pabsw" and "pabsd" calculate the absolute value of each signed
2075	2083	packed value in source operand and store them into the destination

2099	2107	is the only SSSE3 instruction that takes three arguments.
2100	2108
2101	2109
2102		2.1.18 AMD 3DNow! instructions
	2110	2.1.18 AMD 3DNow! instructions
2103	2111
2104	2112	The 3DNow! extension adds new MMX instructions to those described in 2.1.14,
2105	2113	and introduces operation on the 64-bit packed floating point values, each

2116	2124	double word in source operand are used. "pf2iw" converts packed floating
2117	2125	point values to packed word integers, results are extended to double words
2118	2126	using the sign extension. "pfadd" adds packed floating point values. "pfsub"
2119		and "pfsubr" substracts packed floating point values, the first one substracts
2120		source values from destination values, the second one substracts destination
	2127	and "pfsubr" subtracts packed floating point values, the first one subtracts
	2128	source values from destination values, the second one subtracts destination
2121	2129	values from the source values. "pfmul" multiplies packed floating point
2122	2130	values. "pfacc" adds the low and high floating point values of the destination
2123	2131	operand, storing the result in the low double word of destination, and adds
2124	2132	the low and high floating point values of the source operand, storing the
2125		result in the high double word of destination. "pfnacc" substracts the high
	2133	result in the high double word of destination. "pfnacc" subtracts the high
2126	2134	floating point value of the destination operand from the low, storing the
2127		result in the low double word of destination, and substracts the high floating
	2135	result in the low double word of destination, and subtracts the high floating
2128	2136	point value of the source operand from the low, storing the result in the high
2129		double word of destination. "pfpnacc" substracts the high floating point value
	2137	double word of destination. "pfpnacc" subtracts the high floating point value
2130	2138	of the destination operand from the low, storing the result in the low double
2131	2139	word of destination, and adds the low and high floating point values of the
2132	2140	source operand, storing the result in the high double word of destination.

2156	2164	operands.
2157	2165
2158	2166
2159		2.1.19 The x86-64 long mode instructions
	2167	2.1.19 The x86-64 long mode instructions
2160	2168
2161	2169	The AMD64 and EM64T architectures (we will use the common name x86-64 for them
2162	2170	both) extend the x86 instruction set for the 64-bit processing. While legacy

2174	2182
2175	2183	Table 2.4 New registers in long mode
2176	2184	/--------------------------------------------------\
2177		\| Type \| General \| SSE \| AVX \|
	2185	\| Type \| General \| SSE \| AVX \|
2178	2186	\|------\|---------------------------\|-------\|-------\|
2179		\| Bits \| 8 \| 16 \| 32 \| 64 \| 128 \| 256 \|
	2187	\| Bits \| 8 \| 16 \| 32 \| 64 \| 128 \| 256 \|
2180	2188	\|======\|======\|======\|======\|======\|=======\|=======\|
2181		\| \| \| \| \| rax \| \| \|
2182		\| \| \| \| \| rcx \| \| \|
2183		\| \| \| \| \| rdx \| \| \|
2184		\| \| \| \| \| rbx \| \| \|
2185		\| \| spl \| \| \| rsp \| \| \|
2186		\| \| bpl \| \| \| rbp \| \| \|
2187		\| \| sil \| \| \| rsi \| \| \|
2188		\| \| dil \| \| \| rdi \| \| \|
2189		\| \| r8b \| r8w \| r8d \| r8 \| xmm8 \| ymm8 \|
2190		\| \| r9b \| r9w \| r9d \| r9 \| xmm9 \| ymm9 \|
2191		\| \| r10b \| r10w \| r10d \| r10 \| xmm10 \| ymm10 \|
2192		\| \| r11b \| r11w \| r11d \| r11 \| xmm11 \| ymm11 \|
2193		\| \| r12b \| r12w \| r12d \| r12 \| xmm12 \| ymm12 \|
2194		\| \| r13b \| r13w \| r13d \| r13 \| xmm13 \| ymm13 \|
2195		\| \| r14b \| r14w \| r14d \| r14 \| xmm14 \| ymm14 \|
2196		\| \| r15b \| r15w \| r15d \| r15 \| xmm15 \| ymm15 \|
	2189	\| \| \| \| \| rax \| \| \|
	2190	\| \| \| \| \| rcx \| \| \|
	2191	\| \| \| \| \| rdx \| \| \|
	2192	\| \| \| \| \| rbx \| \| \|
	2193	\| \| spl \| \| \| rsp \| \| \|
	2194	\| \| bpl \| \| \| rbp \| \| \|
	2195	\| \| sil \| \| \| rsi \| \| \|
	2196	\| \| dil \| \| \| rdi \| \| \|
	2197	\| \| r8b \| r8w \| r8d \| r8 \| xmm8 \| ymm8 \|
	2198	\| \| r9b \| r9w \| r9d \| r9 \| xmm9 \| ymm9 \|
	2199	\| \| r10b \| r10w \| r10d \| r10 \| xmm10 \| ymm10 \|
	2200	\| \| r11b \| r11w \| r11d \| r11 \| xmm11 \| ymm11 \|
	2201	\| \| r12b \| r12w \| r12d \| r12 \| xmm12 \| ymm12 \|
	2202	\| \| r13b \| r13w \| r13d \| r13 \| xmm13 \| ymm13 \|
	2203	\| \| r14b \| r14w \| r14d \| r14 \| xmm14 \| ymm14 \|
	2204	\| \| r15b \| r15w \| r15d \| r15 \| xmm15 \| ymm15 \|
2197	2205	\--------------------------------------------------/
2198	2206
2199	2207	In general any instruction from x86 architecture, which allowed 16-bit or

2203	2211	registers. Below are the samples of new operations possible in long mode on the
2204	2212	example of "mov" instruction:
2205	2213
2206		mov rax,r8 ; transfer 64-bit general register
	2214	mov rax,r8 ; transfer 64-bit general register
2207	2215	mov al,[rbx] ; transfer memory addressed by 64-bit register
2208	2216
2209	2217	The long mode uses also the instruction pointer based addresses, you can

2283	2291	and "wrmsr" instructions.
2284	2292
2285	2293
2286		2.1.20 SSE4 instructions
	2294	2.1.20 SSE4 instructions
2287	2295
2288	2296	There are actually three different sets of instructions under the name SSE4.
2289	2297	Intel designed two of them, SSE4.1 and SSE4.2, with latter extending the

2420	2428	destination operand, the source can be 64-bit memory or SSE register.
2421	2429
2422	2430	pmovzxbq xmm0,word [si] ; zero-extend bytes to quad words
2423		pmovsxwq xmm0,xmm1 ; sign-extend words to quad words
	2431	pmovsxwq xmm0,xmm1 ; sign-extend words to quad words
2424	2432
2425	2433	"movntdqa" loads double quad word from the source operand to the destination
2426	2434	using a non-temporal hint. The destination operand should be SSE register,

2450	2458	also be a 64-bit general purpose register, and the source operand in such case
2451	2459	can be a byte or quad word register or memory location.
2452	2460
2453		crc32 eax,dl ; accumulate CRC32 on byte value
	2461	crc32 eax,dl ; accumulate CRC32 on byte value
2454	2462	crc32 eax,word [ebx] ; accumulate CRC32 on word value
2455	2463	crc32 rax,qword [rbx] ; accumulate CRC32 on quad word value
2456	2464

2460	2468	the same size as source operand. The 64-bit variant is available only in long
2461	2469	mode.
2462	2470
2463		popcnt ecx,eax ; count bits set to 1
	2471	popcnt ecx,eax ; count bits set to 1
2464	2472
2465	2473	The SSE4a extension, which also includes the "popcnt" instruction introduced
2466	2474	by SSE4.2, at the same time adds the "lzcnt" instruction, which follows the

2475	2483	is no third operand in such case), which should contain position value in bits
2476	2484	8-13 and length of bit string in bits 0-5.
2477	2485
2478		extrq xmm0,8,7 ; extract 8 bits from position 7
2479		extrq xmm0,xmm5 ; extract bits defined by register
	2486	extrq xmm0,8,7 ; extract 8 bits from position 7
	2487	extrq xmm0,xmm5 ; extract bits defined by register
2480	2488
2481	2489	"insertq" writes the sequence of bits from the low quad word of the source
2482	2490	operand into specified position in low quad word of the destination operand,

2488	2496	string in bits 64-69.
2489	2497
2490	2498	insertq xmm1,xmm0,4,2 ; insert 4 bits at position 2
2491		insertq xmm1,xmm0 ; insert bits defined by register
	2499	insertq xmm1,xmm0 ; insert bits defined by register
2492	2500
2493	2501	"movntss" and "movntsd" store single or double precision floating point
2494	2502	value from the source SSE register into 32-bit or 64-bit destination memory
2495	2503	location respectively, using non-temporal hint.
2496	2504
2497	2505
2498		2.1.21 AVX instructions
	2506	2.1.21 AVX instructions
2499	2507
2500	2508	The Advanced Vector Extensions introduce instructions that are new variants
2501	2509	of SSE instructions, with new scheme of encoding that allows extended syntax

2512	2520	the remaining bits of first source SSE register are copied into the
2513	2521	destination register.
2514	2522
2515		vsubss xmm0,xmm2,xmm3 ; substract two 32-bit floats
	2523	vsubss xmm0,xmm2,xmm3 ; subtract two 32-bit floats
2516	2524	vmulsd xmm0,xmm7,qword [esi] ; multiply two 64-bit floats
2517	2525
2518	2526	In case of packed operations, each instruction can also operate on the 256-bit

2526	2534	with three operands, however they are only allowed to operate on 128-bit
2527	2535	packed types and thus cannot use the whole AVX registers.
2528	2536
2529		vpavgw xmm3,xmm0,xmm2 ; average of 16-bit integers
2530		vpslld xmm1,xmm0,1 ; shift double words left
	2537	vpavgw xmm3,xmm0,xmm2 ; average of 16-bit integers
	2538	vpslld xmm1,xmm0,1 ; shift double words left
2531	2539
2532	2540	If the SSE version of instruction had a syntax with three operands, the third
2533	2541	one being an immediate value, the AVX version of such instruction takes four
2534	2542	operands, with immediate remaining the last one.
2535	2543
2536	2544	vshufpd ymm0,ymm1,ymm2,10010011b ; shuffle 64-bit floats
2537		vpalignr xmm0,xmm4,xmm2,3 ; extract byte aligned value
	2545	vpalignr xmm0,xmm4,xmm2,3 ; extract byte aligned value
2538	2546
2539	2547	The promotion to new syntax according to the rules described above has been
2540	2548	applied to all the instructions from SSE extensions up to SSE4, with the

2545	2553	"vrsqrtps", which can operate on 256-bit data size, but retained the syntax
2546	2554	with only two operands, because they use data from only one source:
2547	2555
2548		vsqrtpd ymm1,ymm0 ; put square roots into other register
	2556	vsqrtpd ymm1,ymm0 ; put square roots into other register
2549	2557
2550	2558	In a similar way "vroundpd" and "vroundps" retained the syntax with three
2551		operands, the last one being immediate value.
	2559	operands, the last one being immediate value.
2552	2560
2553	2561	vroundps ymm0,ymm1,0011b ; round toward zero
2554
	2562
2555	2563	Also some of the operations on packed integers kept their two-operand or
2556	2564	three-operand syntax while being promoted to AVX version. In such case these
2557	2565	instructions follow exactly the same rules for operands as their SSE

2574	2582	syntax from SSE without any changes, and also allows a new form with 256-bit
2575	2583	operands in place of 128-bit ones.
2576	2584
2577		vmovups [edi],ymm6 ; store unaligned 256-bit data
	2585	vmovups [edi],ymm6 ; store unaligned 256-bit data
2578	2586
2579	2587	"vmovddup" has the identical 128-bit syntax as its SSE version, and it also
2580	2588	has a 256-bit version, which stores the duplicates of the lowest quad word

2600	2608	either low or high quad word replaced with value from second source (the
2601	2609	memory operand).
2602	2610
2603		vmovhps [esi],xmm7 ; store upper half to memory
	2611	vmovhps [esi],xmm7 ; store upper half to memory
2604	2612	vmovlps xmm0,xmm7,[ebx] ; low from memory, rest from register
2605	2613
2606	2614	"vmovss" and "vmovsd" have syntax identical to their SSE equivalents as long

2609	2617	in destination is then the value copied from first source with lowest data
2610	2618	element replaced with the lowest value from second source.
2611	2619
2612		vmovss xmm3,[edi] ; low from memory, rest zeroed
	2620	vmovss xmm3,[edi] ; low from memory, rest zeroed
2613	2621	vmovss xmm0,xmm1,xmm2 ; one value from xmm2, three from xmm1
2614	2622
2615	2623	"vcvtss2sd", "vcvtsd2ss", "vcvtsi2ss" and "vcvtsi2sd" use the three-operand

2626	2634	128-bit memory as source. Analogously "vcvtpd2dq", "vcvttpd2dq" and
2627	2635	"vcvtpd2ps", in addition to variant with syntax identical to SSE version,
2628	2636	allow a variant with SSE register as destination and AVX register or 256-bit
2629		memory as source.
	2637	memory as source.
2630	2638	"vinsertps", "vpinsrb", "vpinsrw", "vpinsrd", "vpinsrq" and "vpblendw" use
2631	2639	a syntax with four operands, where destination and first source have to be SSE
2632	2640	registers, and the third and fourth operand follow the same rules as second

2646	2654	first source with some data elements replaced, according to mask, by values
2647	2655	from the second source.
2648	2656
2649		vblendvps ymm3,ymm1,ymm2,ymm7 ; blend according to mask
	2657	vblendvps ymm3,ymm1,ymm2,ymm7 ; blend according to mask
2650	2658
2651	2659	"vptest" allows the same syntax as its SSE version and also has a 256-bit
2652	2660	version, with both operands doubled in size. There are also two new

2656	2664	"vptest".
2657	2665
2658	2666	vptest ymm0,yword [ebx] ; test 256-bit values
2659		vtestpd xmm0,xmm1 ; test sign bits of 64-bit floats
	2667	vtestpd xmm0,xmm1 ; test sign bits of 64-bit floats
2660	2668
2661	2669	"vbroadcastss", "vbroadcastsd" and "vbroadcastf128" are new instructions,
2662	2670	which broadcast the data element defined by source operand into all elements

2666	2674	destination. "vbroadcastf128" requires 128-bit memory as source, and AVX
2667	2675	register as destination.
2668	2676
2669		vbroadcastss ymm0,dword [eax] ; get eight copies of value
	2677	vbroadcastss ymm0,dword [eax] ; get eight copies of value
2670	2678
2671	2679	"vinsertf128" is the new instruction, which takes four operands. The
2672	2680	destination and first source have to be AVX registers, second source can be

2687	2695	data (AVX registers). Either destination or second source has to be a memory
2688	2696	location of appropriate size, the two other operands should be registers.
2689	2697
2690		vmaskmovps [edi],xmm0,xmm5 ; conditionally store
2691		vmaskmovpd ymm5,ymm0,[esi] ; conditionally load
	2698	vmaskmovps [edi],xmm0,xmm5 ; conditionally store
	2699	vmaskmovpd ymm5,ymm0,[esi] ; conditionally load
2692	2700
2693	2701	"vpermilpd" and "vpermilps" are the new instructions with three operands
2694	2702	that permute the values from first source according to the control fields from

2713	2721	instructions. The rules for their operands remain unchanged.
2714	2722
2715	2723
2716		2.1.22 AVX2 instructions
	2724	2.1.22 AVX2 instructions
2717	2725
2718	2726	The AVX2 extension allows all the AVX instructions operating on packed integers
2719	2727	to use 256-bit data types, and introduces some new instructions as well.

2722	2730	rules became analogous to AVX instructions operating on packed floating point
2723	2731	types.
2724	2732
2725		vpsubb ymm0,ymm0,[esi] ; substract 32 packed bytes
	2733	vpsubb ymm0,ymm0,[esi] ; subtract 32 packed bytes
2726	2734	vpavgw ymm3,ymm0,ymm2 ; average of 16-bit integers
2727	2735
2728	2736	However there are some instructions that have not been equipped with the

2734	2742	amount to be SSE register or 128-bit memory location, use the same rules
2735	2743	for the third operand in their 256-bit variant.
2736	2744
2737		vpsllw ymm2,ymm2,xmm4 ; shift words left
	2745	vpsllw ymm2,ymm2,xmm4 ; shift words left
2738	2746	vpsrad ymm0,ymm3,xword [ebx] ; shift double words right
2739	2747
2740	2748	There are also new packed shift instructions with standard three-operand AVX

2749	2757	256-bit variant need memory of that size doubled or SSE register as source and
2750	2758	AVX register as destination.
2751	2759
2752		vpmovzxbq ymm0,dword [esi] ; bytes to quad words
	2760	vpmovzxbq ymm0,dword [esi] ; bytes to quad words
2753	2761
2754	2762	Also "vmovntdqa" has been upgraded with 256-bit variant, so it allows to
2755	2763	transfer 256-bit value from memory to AVX register, it needs memory address

2771	2779	element.
2772	2780
2773	2781	vpbroadcastb ymm0,byte [ebx] ; get 32 identical bytes
2774
	2782
2775	2783	"vpermd" and "vpermps" are new three-operand instructions, which use each
2776	2784	32-bit element from first source as an index of element in second source which
2777	2785	is copied into destination at position corresponding to element containing

2781	2789	indexes from the immediate value specified as third operand to determine which
2782	2790	element from source store at given position in destination. The destination
2783	2791	has to be AVX register, source can be AVX register or 256-bit memory, and the
2784		third operand must be 8-bit immediate value.
	2792	third operand must be 8-bit immediate value.
2785	2793	The family of new instructions performing "gather" operation have special
2786	2794	syntax, as in their memory operand they use addressing mode that is unique to
2787	2795	them. The base of address can be a 32-bit or 64-bit general purpose register

2837	2845	respectively.
2838	2846
2839	2847
2840		2.1.23 Auxiliary sets of computational instructions
	2848	2.1.23 Auxiliary sets of computational instructions
2841	2849
2842	2850	There is a number of additional instruction set extensions related to
2843	2851	AVX. They introduce new vector instructions (and sometimes also their SSE

2884	2892	The mnemonic of FMA instruction is obtained by appending to "vf" prefix: first
2885	2893	either "m" or "nm" to select whether result of multiplication should be taken
2886	2894	as-is or negated, then either "add" or "sub" to select whether third value
2887		will be added to the product or substracted from the product, then either
	2895	will be added to the product or subtracted from the product, then either
2888	2896	"132", "213" or "231" to select which source operands are multiplied and which
2889		one is added or substracted, and finally the type of data on which the
	2897	one is added or subtracted, and finally the type of data on which the
2890	2898	instruction operates, either "ps", "pd", "ss" or "sd". As it was with SSE
2891	2899	instructions promoted to AVX, instructions operating on packed floating point
2892	2900	values allow 128-bit or 256-bit syntax, in former all the operands are SSE

2896	2904	SSE registers, and the third operand can also be a memory, either 32-bit for
2897	2905	single precision or 64-bit for double precision.
2898	2906
2899		vfmsub231ps ymm1,ymm2,ymm3 ; multiply and substract
2900		vfnmadd132sd xmm0,xmm5,[ebx] ; multiply, negate and add
	2907	vfmsub231ps ymm1,ymm2,ymm3 ; multiply and subtract
	2908	vfnmadd132sd xmm0,xmm5,[ebx] ; multiply, negate and add
2901	2909
2902	2910	In addition to the instructions created by the rule described above, there are
2903	2911	families of instructions with mnemonics starting with either "vfmaddsub" or
2904	2912	"vfmsubadd", followed by either "132", "213" or "231" and then either "ps" or
2905	2913	"pd" (the operation must always be on packed values in this case). They add
2906		to the result of multiplication or substract from it depending on the position
	2914	to the result of multiplication or subtract from it depending on the position
2907	2915	of value in packed data - instructions from the "vfmaddsub" group add when the
2908		position is odd and substract when the position is even, instructions from the
	2916	position is odd and subtract when the position is even, instructions from the
2909	2917	"vfmsubadd" group add when the position is even and subtract when the
2910	2918	position is odd. The rules for operands are the same as for other FMA
2911	2919	instructions.

2915	2923	out, as having separate destination operand makes such selection of operands
2916	2924	superfluous. The multiplication is always performed on values from the first
2917	2925	and second source, and then the value from third source is added or
2918		substracted. Either second or third source can be a memory operand, and the
	2926	subtracted. Either second or third source can be a memory operand, and the
2919	2927	rules for the sizes of operands are the same as for FMA instructions.
2920	2928
2921		vfmaddpd ymm0,ymm1,[esi],ymm2 ; multiply and add
2922		vfmsubss xmm0,xmm1,xmm2,[ebx] ; multiply and substract
	2929	vfmaddpd ymm0,ymm1,[esi],ymm2 ; multiply and add
	2930	vfmsubss xmm0,xmm1,xmm2,[ebx] ; multiply and subtract
2923	2931
2924	2932	The F16C extension consists of two instructions, "vcvtps2ph" and
2925	2933	"vcvtph2ps", which convert floating point values between single precision and

2942	2950	on a solitary double precision value and 32-bit for operation on a solitary
2943	2951	single precision value).
2944	2952
2945		vfrczps ymm0,[esi] ; load fractional parts
	2953	vfrczps ymm0,[esi] ; load fractional parts
2946	2954
2947	2955	"vpcmov" copies bits from either first or second source into destination
2948	2956	depending on the values of corresponding bits in the fourth operand (the

2970	2978	of comparison encoded within the instruction name by inserting the comparison
2971	2979	mnemonic after "vpcom".
2972	2980
2973		vpcomb xmm0,xmm1,xmm2,4 ; test for equal bytes
2974		vpcomgew xmm0,xmm1,[ebx] ; compare signed words
	2981	vpcomb xmm0,xmm1,xmm2,4 ; test for equal bytes
	2982	vpcomgew xmm0,xmm1,[ebx] ; compare signed words
2975	2983
2976	2984	Table 2.5 XOP comparisons
2977	2985	/-------------------------------------------\
2978		\| Code \| Mnemonic \| Description \|
	2986	\| Code \| Mnemonic \| Description \|
2979	2987	\|======\|==========\|=========================\|
2980		\| 0 \| lt \| less than \|
2981		\| 1 \| le \| less than or equal \|
2982		\| 2 \| gt \| greater than \|
2983		\| 3 \| ge \| greater than or equal \|
2984		\| 4 \| eq \| equal \|
2985		\| 5 \| neq \| not equal \|
2986		\| 6 \| false \| false \|
2987		\| 7 \| true \| true \|
	2988	\| 0 \| lt \| less than \|
	2989	\| 1 \| le \| less than or equal \|
	2990	\| 2 \| gt \| greater than \|
	2991	\| 3 \| ge \| greater than or equal \|
	2992	\| 4 \| eq \| equal \|
	2993	\| 5 \| neq \| not equal \|
	2994	\| 6 \| false \| false \|
	2995	\| 7 \| true \| true \|
2988	2996	\-------------------------------------------/
2989	2997
2990	2998	"vpermil2ps" and "vpermil2pd" set the elements in destination register to

3006	3014	64-bit results, "vphaddwd" and "vphadduwd" add pairs of words to 32-bit
3007	3015	results, "vphaddwq" and "vphadduwq" sum all words in each four-word block to
3008	3016	64-bit results, "vphadddq" and "vphaddudq" add pairs of double words to 64-bit
3009		results. "vphsubbw" substracts in each two-byte block the byte at higher
	3017	results. "vphsubbw" subtracts in each two-byte block the byte at higher
3010	3018	position from the one at lower position, and stores the result as a signed
3011	3019	16-bit value at the corresponding position in destination, "vphsubwd"
3012		substracts in each two-word block the word at higher position from the one at
3013		lower position and makes signed 32-bit results, "vphsubdq" substract in each
	3020	subtracts in each two-word block the word at higher position from the one at
	3021	lower position and makes signed 32-bit results, "vphsubdq" subtracts in each
3014	3022	block of two double words the one at higher position from the one at lower
3015	3023	position and makes signed 64-bit results. Each of these instructions takes
3016	3024	two operands, the destination being SSE register, and the source being SSE
3017	3025	register or 128-bit memory.
3018	3026
3019		vphadduwq xmm0,xmm1 ; sum quadruplets of words
	3027	vphadduwq xmm0,xmm1 ; sum quadruplets of words
3020	3028
3021	3029	"vpmacsww" and "vpmacssww" multiply the corresponding signed 16-bit values
3022	3030	from the first and second source and then add the products to the parallel

3054	3062	memory (or they can be SSE registers both) and the other operands have to be
3055	3063	SSE registers.
3056	3064
3057		vpshld xmm3,xmm1,[ebx] ; shift bytes from xmm1
	3065	vpshld xmm3,xmm1,[ebx] ; shift bytes from xmm1
3058	3066
3059	3067	"vpshab", "vpshaw", "vpshad" and "vpshaq" arithmetically shift bytes, words,
3060	3068	double words or quad words. These instructions follow the same rules as the

3063	3071	shifts, but additionally allow third operand to be immediate value, in which
3064	3072	case the same amount of rotation is specified for all the elements in source.
3065	3073
3066		vprotb xmm0,[esi],3 ; rotate bytes to the left
	3074	vprotb xmm0,[esi],3 ; rotate bytes to the left
3067	3075
3068	3076	The MOVBE extension introduces just one new instruction, "movbe", which
3069	3077	swaps bytes in value from source before storing it in destination, so can

3081	3089	the first source have to be general registers, the second source can be
3082	3090	general register or memory.
3083	3091
3084		andn edx,eax,[ebx] ; bit-multiply inverted eax with memory
	3092	andn edx,eax,[ebx] ; bit-multiply inverted eax with memory
3085	3093
3086	3094	"bextr" extracts from the first source the sequence of bits using an index
3087	3095	and length specified by bit fields in the second source operand and stores

3096	3104	bits in destination to zero. The destination must be a general register,
3097	3105	the source can be general register or memory.
3098	3106
3099		blsi rax,r11 ; isolate the lowest set bit
	3107	blsi rax,r11 ; isolate the lowest set bit
3100	3108
3101	3109	"blsmsk" sets all the bits in the destination up to the lowest set bit in
3102	3110	the source, including this bit. "blsr" copies all the bits from the source to

3114	3122	"pdep" performs the reverse operation - it takes sequence of bits from the
3115	3123	first source and puts them consecutively at the positions where the bits in
3116	3124	second source are set, setting all the other bits in destination to zero.
3117		These BMI2 instructions follow the same rules for operands as "andn".
	3125	These BMI2 instructions follow the same rules for operands as "andn".
3118	3126	"mulx" is a BMI2 instruction which performs an unsigned multiplication of
3119	3127	value from EDX or RDX register (depending on the size of specified operands)
3120	3128	by the value from third operand, and stores the low half of result in the

3122	3130	it without affecting the flags. The third operand can be general register or
3123	3131	memory, and both the destination operands have to be general registers.
3124	3132
3125		mulx edx,eax,ecx ; multiply edx by ecx into edx:eax
	3133	mulx edx,eax,ecx ; multiply edx by ecx into edx:eax
3126	3134
3127	3135	"shlx", "shrx" and "sarx" are BMI2 instructions, which perform logical or
3128	3136	arithmetical shifts of value from first source by the amount specified by

3134	3142	has to be general register, the source operand can be general register or
3135	3143	memory, and the third operand has to be an immediate value.
3136	3144
3137		rorx eax,edx,7 ; rotate without affecting flags
3138
	3145	rorx eax,edx,7 ; rotate without affecting flags
	3146
3139	3147	The TBM is an extension designed by AMD to supplement the BMI set. The
3140	3148	"bextr" instruction is extended with a new form, in which second source is
3141	3149	a 32-bit immediate value. "blsic" is a new instruction which performs the

3150	3158	"tzmsk" finds the lowest set bit in value from source operand, sets all bits
3151	3159	below it to 1 and all the rest of bits to zero, then writes the result to
3152	3160	destination. "t1mskc" finds the least significant zero bit in the value from
3153		source operand, sets the bits below it to zero and all the other bits to 1,
	3161	source operand, sets the bits below it to zero and all the other bits to 1,
3154	3162	and writes the result to destination. These instructions have the same rules
3155	3163	for operands as "blsi".
3156	3164
3157	3165
3158		2.1.24 AVX-512 instructions
3159
3160		[This section has not been written yet.]
3161
3162
3163		2.1.25 Other extensions of instruction set
	3166	2.1.24 AVX-512 instructions
	3167
	3168	The AVX-512 introduces 512-bit vector registers, which extend the 256-bit
	3169	registers used by AVX and AVX2. It also extends the set of vector registers
	3170	from 16 to 32, with the additional registers "zmm16" to "zmm31", their low
	3171	256-bit portions "ymm16" to "ymm31" and their low 128-bit portions "xmm16"
	3172	to "xmm31". These additional registers can only be accessed in the long mode.
	3173
	3174	Table 2.6 New registers available in long mode with AVX-512
	3175	/------------------------------------------------------------------\
	3176	\| Size \| Registers \|
	3177	\|---------\|--------------------------------------------------------\|
	3178	\| 128-bit \| xmm16 xmm17 xmm18 xmm19 xmm20 xmm21 xmm22 xmm23 \|
	3179	\| \| xmm24 xmm25 xmm26 xmm27 xmm28 xmm29 xmm30 xmm31 \|
	3180	\|---------\|--------------------------------------------------------\|
	3181	\| 256-bit \| ymm16 ymm17 ymm18 ymm19 ymm20 ymm21 ymm22 ymm23 \|
	3182	\| \| ymm24 ymm25 ymm26 ymm27 ymm28 ymm29 ymm30 ymm31 \|
	3183	\|---------\|--------------------------------------------------------\|
	3184	\| 512-bit \| zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 \|
	3185	\| \| zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31 \|
	3186	\------------------------------------------------------------------/
	3187
	3188	In addition to new operand sizes and registers, the AVX-512 introduces
	3189	a number of supplementary settings that can be included in the operands
	3190	of AVX instructions.
	3191	The destination operand of most AVX instructions can be followed
	3192	by the name of an opmask register enclosed in braces, this modifier
	3193	specifies a mask that decides which units of data in the destination
	3194	operand are going to be updated. The "k0" register cannot be used as a
	3195	destination mask. This setting can be further followed by "{z}" modifier
	3196	to choose that the data units not selected by mask should be zeroed
	3197	instead of leaving them unchanged.
	3198
	3199	vaddpd zmm1{k1},zmm5,zword [rsi] ; update selected floats
	3200	vaddps ymm6{k1}{z},ymm12,ymm24 ; update selected, zero other ones
	3201
	3202	When an instruction that operates on packed data has a source operand
	3203	loaded from memory, the memory location may be just a single unit of data
	3204	and the source used for the operation is created by broadcasting this
	3205	value into all the units within the required size. To specify that such
	3206	broadcasting method is used the memory operand should be followed by one
	3207	of the "{1to2}", "{1to4}", "{1to8}", "{1to16}", "{1to32}" and "{1to64}"
	3208	modifiers, selecting the appropriate multiple of a unit.
	3209
	3210	vsubps zmm1,zmm2,dword [rsi] {1to16} ; subtract from all floats
	3211
	3212	When an instruction does not use a memory operand, often an additional
	3213	operand may follow the source operands, containing the rounding mode
	3214	specifier. When an instruction has variants that operate on different
	3215	sizes of data, the rounding mode can be specified only when the
	3216	register operands are 512-bit.
	3217
	3218	vdivps zmm2,zmm3,zmm5,{ru-sae} ; round results up
	3219
	3220	Table 2.7 AVX-512 rounding modes
	3221	/----------------------------------------------------------\
	3222	\| Operand \| Description \|
	3223	\|==========\|===============================================\|
	3224	\| {rn-sae} \| round to nearest and suppress all exceptions \|
	3225	\| {rd-sae} \| round down and suppress all exceptions \|
	3226	\| {ru-sae} \| round up and suppress all exceptions \|
	3227	\| {rz-sae} \| round toward zero and suppress all exceptions \|
	3228	\----------------------------------------------------------/
	3229
	3230	Some of the instructions do not use a rounding mode but still allow
	3231	to specify the exception suppression option with "{sae}" modifier in the
	3232	additional operand.
	3233
	3234	vmaxpd zmm0,zmm1,zmm2,{sae} ; suppress all exceptions
	3235
	3236	The family of "gather" instructions in their AVX-512 variants use a new
	3237	syntax with only two operands. The opmask register takes the role which
	3238	was played by the third operand in the AVX2 syntax and it is mandatory
	3239	in this case.
	3240
	3241	vgatherdps xmm0{k1},[eax+xmm1] ; gather four floats
	3242	vgatherdpd zmm0{k3},[ymm3*8] ; gather eight doubles
	3243
	3244	The new family of "scatter" instructions perform an operation reverse to
	3245	the one of "gather". They also take two operands, the destination is a
	3246	memory with vector indexing and opmask modifier, and the source is a vector
	3247	register.
	3248
	3249	vscatterdps [eax+xmm1]{k1},xmm0 ; scatter four floats
	3250	vscatterdpd [ymm3*8]{k3},zmm0 ; scatter eight doubles
	3251
	3252
	3253	2.1.25 Other extensions of instruction set
3164	3254
3165	3255	There is a number of additional instruction set extensions recognized by flat
3166	3256	assembler, and the general syntax of the instructions introduced by those
3167	3257	extensions is provided here. For detailed information on the operations
3168	3258	performed by them, check out the manuals from Intel (for the VMX, SMX, XSAVE,
3169		RDRAND, FSGSBASE, INVPCID, HLE and RTM extensions) or AMD (for the SVM
	3259	RDRAND, FSGSBASE, INVPCID, HLE, RTM, and MPX extensions) or AMD (for the SVM
3170	3260	extension).
3171	3261	The Virtual-Machine Extensions (VMX) provide a set of instructions for the
3172	3262	management of virtual machines. The "vmxon" instruction, which enters the VMX

3257	3347	an 8-bit immediate value as its only operand, this value is passed in the
3258	3348	highest bits of EAX to the fallback routine. "xtest" checks whether there is
3259	3349	transactional execution in progress, this instruction takes no operands.
	3350	The MPX extension adds instructions that operate on new bounds registers
	3351	and aid in checking the memory references. For some of these instructions
	3352	flat assembler allows a special syntax that gives fine control over their
	3353	operation, where an address of a memory operand is separated into two parts
	3354	with a comma. With "bndmk" instruction the first part of such address specifies
	3355	the lower bound and the second one the upper bound. The lower bound can be
	3356	either zero or a register, the upper bound can be any address that uses no more
	3357	than one register (multiplied by 1, 2, 4, or 8). The addressing registers need
	3358	to be 64-bit when in long mode, and 32-bit otherwise.
	3359
	3360	bndmk bnd0,[rbx,100000h] ; lower bound in register, upper directly
	3361	bndmk bnd1,[0,rbx] ; lower bound zero, upper in register
	3362
	3363	In case of "bndldx" and "bndstx", the first part of memory operand specifies an
	3364	address used to access a bound table entry, while the second part is either zero
	3365	or a register that plays the role of an additional operand for such instruction.
	3366	The address in the first part may use no more than one register and the register
	3367	cannot be multiplied by a number other than 1.
	3368
	3369	bndstx [rcx,rsi],bnd3 ; store bnd3 and rsi at rcx in the bound table
	3370	bndldx bnd2,[rcx,rsi] ; load from bound table if entry matches rsi
3260	3371
3261	3372
3262	3373	2.2 Control directives

3357	3468	defined somewhere in source:
3358	3469
3359	3470	if count>0
3360		mov cx,count
3361		rep movsb
	3471	mov cx,count
	3472	rep movsb
3362	3473	end if
3363	3474
3364	3475	These two assembly instructions will be assembled only if the "count" constant
3365	3476	is greater than 0. The next sample shows more complex conditional structure:
3366	3477
3367	3478	if count & ~ count mod 4
3368		mov cx,count/4
3369		rep movsd
	3479	mov cx,count/4
	3480	rep movsd
3370	3481	else if count>4
3371		mov cx,count/4
3372		rep movsd
3373		mov cx,count mod 4
3374		rep movsb
	3482	mov cx,count/4
	3483	rep movsd
	3484	mov cx,count mod 4
	3485	rep movsb
3375	3486	else
3376		mov cx,count
3377		rep movsb
	3487	mov cx,count
	3488	rep movsb
3378	3489	end if
3379	3490
3380	3491	The first block of instructions gets assembled when the "count" is non zero and

3422	3533	for example:
3423	3534
3424	3535	repeat 8
3425		mov byte [bx],%
3426		inc bx
	3536	mov byte [bx],%
	3537	inc bx
3427	3538	end repeat
3428	3539
3429	3540	The generated code will store byte values from one to eight in the memory

3436	3547
3437	3548	s = x/2
3438	3549	repeat 100
3439		if x/s = s
3440		break
3441		end if
3442		s = (s+x/s)/2
	3550	if x/s = s
	3551	break
	3552	end if
	3553	s = (s+x/s)/2
3443	3554	end repeat
3444	3555
3445	3556	The "while" directive repeats the block of instructions as long as the

3454	3565
3455	3566	s = x/2
3456	3567	while x/s <> s
3457		s = (s+x/s)/2
3458		if % = 100
3459		break
3460		end if
	3568	s = (s+x/s)/2
	3569	if % = 100
	3570	break
	3571	end if
3461	3572	end while
3462	3573
3463	3574	The blocks defined with "if", "repeat" and "while" can be nested in any

3503	3614	generated in current addressing space you can use such block of directives:
3504	3615
3505	3616	repeat $-$$
3506		load a byte from $$+%-1
3507		store byte a xor c at $$+%-1
	3617	load a byte from $$+%-1
	3618	store byte a xor c at $$+%-1
3508	3619	end repeat
3509	3620
3510	3621	and each byte of code will be xored with the value defined by "c" constant.

3521	3632
3522	3633	GDTR dp ?
3523	3634	virtual at GDTR
3524		GDT_limit dw ?
3525		GDT_address dd ?
	3635	GDT_limit dw ?
	3636	GDT_address dd ?
3526	3637	end virtual
3527	3638
3528	3639	It defines two labels for parts of the 48-bit variable at "GDTR" address.

3530	3641	register, for example:
3531	3642
3532	3643	virtual at bx
3533		LDT_limit dw ?
3534		LDT_address dd ?
	3644	LDT_limit dw ?
	3645	LDT_address dd ?
3535	3646	end virtual
3536	3647
3537	3648	With such definition instruction "mov ax,[LDT_limit]" will be assembled

3544	3655	example:
3545	3656
3546	3657	virtual at 0
3547		xor eax,eax
3548		and edx,eax
3549		load zeroq dword from 0
	3658	xor eax,eax
	3659	and edx,eax
	3660	load zeroq dword from 0
3550	3661	end virtual
3551	3662
3552	3663	The above piece of code will define the "zeroq" constant containing four bytes

3555	3666	For example this code:
3556	3667
3557	3668	virtual at 0
3558		file 'a.txt':10h,1
3559		load char from 0
	3669	file 'a.txt':10h,1
	3670	load char from 0
3560	3671	end virtual
3561	3672
3562	3673	loads the single byte from offset 10h in file "a.txt" into the "char"

3576	3687	has been closed:
3577	3688
3578	3689	virtual at 0
3579		hex_digits::
3580		db '0123456789ABCDEF'
	3690	hex_digits::
	3691	db '0123456789ABCDEF'
3581	3692	end virtual
3582	3693	load a byte from hex_digits:10
3583	3694

3604	3715	create the alignment yourself, like:
3605	3716
3606	3717	virtual
3607		align 16
3608		a = $ - $$
	3718	align 16
	3719	a = $ - $$
3609	3720	end virtual
3610	3721	db a dup 0
3611	3722

3619	3730	bits = 16
3620	3731	display 'Current offset is 0x'
3621	3732	repeat bits/4
3622		d = '0' + $ shr (bits-%*4) and 0Fh
3623		if d > '9'
3624		d = d + 'A'-'9'-1
3625		end if
3626		display d
	3733	d = '0' + $ shr (bits-%*4) and 0Fh
	3734	if d > '9'
	3735	d = d + 'A'-'9'-1
	3736	end if
	3737	display d
3627	3738	end repeat
3628	3739	display 13,10
3629	3740

3673	3784	Consider the following example:
3674	3785
3675	3786	if ~ defined alpha
3676		alpha:
	3787	alpha:
3677	3788	end if
3678	3789
3679	3790	The "defined" operator gives the true value when the expression following it

3695	3806	condition may make it possible to get it resolved:
3696	3807
3697	3808	if ~ defined alpha \| defined @f
3698		alpha:
3699		@@:
	3809	alpha:
	3810	@@:
3700	3811	end if
3701	3812
3702	3813	The "@f" is always the same label as the nearest "@@" symbol in the source

3708	3819	look at the blocks that have nothing more than this self-establishing:
3709	3820
3710	3821	if defined @f
3711		@@:
	3822	@@:
3712	3823	end if
3713	3824
3714	3825	This is an example of source that may have more than one solution, as both

3881	3992
3882	3993	macro stos0
3883	3994	{
3884		xor al,al
3885		stosb
	3995	xor al,al
	3996	stosb
3886	3997	}
3887	3998
3888	3999	The macroinstruction "stos0" will be replaced with these two assembly

3908	4019	macro mov op1,op2
3909	4020	{
3910	4021	if op1 in <ds,es,fs,gs,ss> & op2 in <cs,ds,es,fs,gs,ss>
3911		push op2
3912		pop op1
	4022	push op2
	4023	pop op1
3913	4024	else
3914		mov op1,op2
	4025	mov op1,op2
3915	4026	end if
3916	4027	}
3917	4028

3924	4035	macro mov op1,op2,op3
3925	4036	{
3926	4037	if op3 eq
3927		mov op1,op2
	4038	mov op1,op2
3928	4039	else
3929		mov op1,op2
3930		mov op2,op3
	4040	mov op1,op2
	4041	mov op2,op3
3931	4042	end if
3932	4043	}
3933	4044

3971	4082
3972	4083	macro stoschar [char]
3973	4084	{
3974		mov al,char
3975		stosb
	4085	mov al,char
	4086	stosb
3976	4087	}
3977	4088
3978	4089	This macroinstruction accepts unlimited number of arguments, and each one

3997	4108
3998	4109	macro movstr
3999	4110	{
4000		local move
	4111	local move
4001	4112	move:
4002		lodsb
4003		stosb
4004		test al,al
4005		jnz move
	4113	lodsb
	4114	stosb
	4115	test al,al
	4116	jnz move
4006	4117	}
4007	4118
4008	4119	Each time this macroinstruction is used, "move" will become other unique name

4027	4138	macro strtbl name,[string]
4028	4139	{
4029	4140	common
4030		label name dword
	4141	label name dword
4031	4142	forward
4032		local label
4033		dd label
	4143	local label
	4144	dd label
4034	4145	forward
4035		label db string,0
	4146	label db string,0
4036	4147	}
4037	4148
4038	4149	First argument given to this macroinstruction will become the label for table

4080	4191
4081	4192	macro jif op1,cond,op2,label
4082	4193	{
4083		cmp op1,op2
4084		j#cond label
	4194	cmp op1,op2
	4195	j#cond label
4085	4196	}
4086	4197
4087	4198	For example "jif ax,ae,10h,exit" will be assembled as "cmp ax,10h" and

4097	4208
4098	4209	macro label name
4099	4210	{
4100		label name
4101		if ~ used name
4102		display `name # " is defined but not used.",13,10
4103		end if
	4211	label name
	4212	if ~ used name
	4213	display `name # " is defined but not used.",13,10
	4214	end if
4104	4215	}
4105	4216
4106	4217	When label defined with such macro is not used in the source, macro will warn

4113	4224	macro message arg
4114	4225	{
4115	4226	if arg eqtype ""
4116		local str
4117		jmp @f
4118		str db arg,0Dh,0Ah,24h
4119		@@:
4120		mov dx,str
	4227	local str
	4228	jmp @f
	4229	str db arg,0Dh,0Ah,24h
	4230	@@:
	4231	mov dx,str
4121	4232	else
4122		mov dx,arg
	4233	mov dx,arg
4123	4234	end if
4124		mov ah,9
4125		int 21h
	4235	mov ah,9
	4236	int 21h
4126	4237	}
4127	4238
4128	4239	The above macro is designed for displaying messages in DOS programs. When the

4147	4258	{
4148	4259	macro instr op1,op2,op3
4149	4260	\{
4150		if op3 eq
4151		instr op1,op2
4152		else
4153		instr op1,op2
4154		instr op2,op3
4155		end if
	4261	if op3 eq
	4262	instr op1,op2
	4263	else
	4264	instr op1,op2
	4265	instr op2,op3
	4266	end if
4156	4267	\}
4157	4268	}
4158	4269

4184	4295	defines an alternative syntax for defining macroinstructions, which looks like:
4185	4296
4186	4297	MACRO stoschar char
4187		mov al,char
4188		stosb
	4298	mov al,char
	4299	stosb
4189	4300	ENDM
4190	4301
4191	4302	Note that symbol that has such customized definition must be defined with "fix"

4226	4337
4227	4338	struc point x,y
4228	4339	{
4229		.x dw x
4230		.y dw y
	4340	.x dw x
	4341	.y dw y
4231	4342	}
4232	4343
4233	4344	For example "my point 7,11" will define structure labeled "my", consisting of

4242	4353	struc db [data]
4243	4354	{
4244	4355	common
4245		. db data
4246		.size = $ - .
	4356	. db data
	4357	.size = $ - .
4247	4358	}
4248	4359
4249	4360	With such definition "msg db 'Hello!',13,10" will define also "msg.size"

4277	4388
4278	4389	rept 3 counter
4279	4390	{
4280		byte#counter db counter
	4391	byte#counter db counter
4281	4392	}
4282	4393
4283	4394	will generate lines:

fasm.x64 less more

Binary diff not shown

-6

source/assemble.inc less more

1377	1377	ret
1378	1378	data_bytes:
1379	1379	call define_data
1380		define_data_byte:
1381	1380	jc instruction_assembled
1382	1381	lods byte [esi]
1383	1382	cmp al,'('

1423	1422	mov [base_code],0
1424	1423	define_words:
1425	1424	call define_data
1426		define_data_word:
1427	1425	jc instruction_assembled
1428	1426	lods byte [esi]
1429	1427	cmp al,'('

1463	1461	ret
1464	1462	data_dwords:
1465	1463	call define_data
1466		define_data_dword:
1467	1464	jc instruction_assembled
1468	1465	lods byte [esi]
1469	1466	cmp al,'('

1508	1505	ret
1509	1506	data_pwords:
1510	1507	call define_data
1511		define_data_pword:
1512	1508	jc instruction_assembled
1513	1509	lods byte [esi]
1514	1510	cmp al,'('

1557	1553	ret
1558	1554	data_qwords:
1559	1555	call define_data
1560		define_data_qword:
1561	1556	jc instruction_assembled
1562	1557	lods byte [esi]
1563	1558	cmp al,'('

1579	1574	ret
1580	1575	data_twords:
1581	1576	call define_data
1582		define_data_tword:
1583	1577	jc instruction_assembled
1584	1578	lods byte [esi]
1585	1579	cmp al,'('

-1

source/avx.inc less more

640	640	cmp ah,16
641	641	jne invalid_operand_size
642	642	mov [postbyte_register],al
	643	avx_movd_reg_ready:
	644	test [rex_prefix],8
	645	jz nomem_instruction_ready
	646	cmp [code_type],64
	647	jne illegal_instruction
643	648	jmp nomem_instruction_ready
644	649	avx_movd_xmmreg:
645	650	sub [extended_code],10h

675	680	cmp ah,[mmx_size]
676	681	jne invalid_operand_size
677	682	mov bl,al
678		jmp nomem_instruction_ready
	683	jmp avx_movd_reg_ready
679	684	avx_movq_xmmreg_xmmreg:
680	685	cmp [mmx_size],8
681	686	jne invalid_operand

2110	2115	mov cl,4
2111	2116	jmp avx_pinsr_instruction_3a
2112	2117	avx_pinsrq_instruction:
	2118	cmp [code_type],64
	2119	jne illegal_instruction
2113	2120	mov cl,8
2114	2121	or [rex_prefix],8
2115	2122	avx_pinsr_instruction_3a:

2448	2455	cmp al,','
2449	2456	jne invalid_operand
2450	2457	lods byte [esi]
	2458	call get_size_operator
2451	2459	cmp al,'['
2452	2460	jne invalid_operand
2453	2461	call get_address

-1

source/version.inc less more

32	32	; cannot simply be copied and put under another distribution licence
33	33	; (including the GNU Public Licence).
34	34
35		VERSION_STRING equ "1.71.59"
	35	VERSION_STRING equ "1.71.60"
36	36
37	37	VERSION_MAJOR = 1
38	38	VERSION_MINOR = 71

-0

whatsnew.txt less more

0	0
1	1	Visit http://flatassembler.net/ for more information.
	2
	3
	4	version 1.71.60 (Feb 05, 2017)
	5
	6	[+] Updated documentation.
	7
	8	[-] Minor corrections in error detection of some AVX instruction handlers.
2	9
3	10
4	11	version 1.71.59 (Jan 20, 2017)