ntl / 12851b6
New upstream version 11.3.0 Julien Puydt 5 years ago
134 changed file(s) with 11281 addition(s) and 3983 deletion(s).
0 NTL -- a library for doing number theory -- version 11.0.0
1 Release date: 2018.04.07
0 NTL -- a library for doing number theory -- version 11.3.0
1 Release date: 2018.08.17
22
33 Author: Victor Shoup (victor@shoup.net)
44
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/BasicThreadPool.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/BasicThreadPool.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2E.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2E.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2EX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2EX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2EXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2EXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2X.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2X.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2XFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2XFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2XVec.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2XVec.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/HNF.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/HNF.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/LLL.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/LLL.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/Lazy.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/Lazy.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/LazyTable.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/LazyTable.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/RR.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/RR.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/SmartPtr.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/SmartPtr.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
Binary diff not shown
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
293293 <span class="Comment">// The implementation may or may not Euclid's algorithm,</span>
294294 <span class="Comment">// but the coefficients a and t are always computed as if </span>
295295 <span class="Comment">// it did.</span>
296
297 <span class="Comment">// In particular, the following inequalties should hold:</span>
298 <span class="Comment">// |s| &lt;= 1 OR |s| &lt; |b|/(2*d)</span>
299 <span class="Comment">// |t| &lt;= 1 OR |t| &lt; |a|/(2*d)</span>
300
296301
297302
298303 <span class="Comment">// special-purpose single-precision variants:</span>
261261 // The implementation may or may not use Euclid's algorithm,
262262 // but the coefficients s and t are always computed as if
263263 // it did.
264
265 // In particular, the following inequalities should hold:
266 // |s| <= 1 OR |s| < |b|/(2*d)
267 // |t| <= 1 OR |t| < |a|/(2*d)
268
264269
265270
266271 // special-purpose single-precision variants:
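To make the stated bounds concrete, here is a minimal sketch (not part of the NTL documentation) that calls XGCD on random 1000-bit integers and checks both the identity d = s*a + t*b and the coefficient bounds; the strict inequalities are assumed to apply because d > 0 and |a|, |b| > 1 for such inputs.

    #include <NTL/ZZ.h>
    #include <cassert>
    using namespace NTL;

    int main()
    {
       ZZ a, b, d, s, t;
       RandomBits(a, 1000);
       RandomBits(b, 1000);

       XGCD(d, s, t, a, b);            // d = gcd(a, b) = s*a + t*b

       assert(d == s*a + t*b);
       // the bounds above, rearranged to avoid division:
       assert(abs(s) <= 1 || 2*d*abs(s) < abs(b));
       assert(abs(t) <= 1 || 2*d*abs(t) < abs(a));
       return 0;
    }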
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZVec.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZVec.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_limbs.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_limbs.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_pEX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_pEX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_pEXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_pEXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_pX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_pX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_pXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_pXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
3838 NTL_STD_CXX11=on # Build assuming C++11 features
3939
4040 NTL_SAFE_VECTORS=on # build in "safe vector" mode
41
42 NTL_ENABLE_AVX_FFT=off # implement the small-prime FFT using AVX
43 # instructions...this is experimental at the
44 # moment, and may lead to worse performance
45
46 NTL_AVOID_AVX512=off # avoid using 512-bit AVX registers
4147
4248
4349 ########## Here are more detailed description of these variables.
224230 # details.
225231
226232
233 ############ AVX FFT
234
235 NTL_ENABLE_AVX_FFT=off # implement the small-prime FFT using AVX
236 # instructions...this is experimental at the
237 # moment, and may lead to worse performance
238
239 On machines with AVX2/FMA or AVX512, this will implement the small-prime FFT
240 using AVX code. This is still quite experimental, and may lead to worse
241 performance. While the FFT itself can run 2-3 times faster, this comes at the
242 cost of (1) a restriction to 50-bit primes (so NTL_SP_NBITS will be set to 50
243 instead of 60), and (2) possible CPU-speed throttling, which slows down other
244 operations. So far, it seems that the only operations that are faster are
245 arithmetic operations in zz_pX, and only for certain choices of modulus.
246 Arithmetic operations in ZZ_pX, with a large modulus, can run slower with AVX
247 than without.
248
249
250 ########### Avoid 512-bit AVX registers
251
252 NTL_AVOID_AVX512=off # avoid using 512-bit AVX registers
253
254 Even if 512-bit AVX registers are available, this setting avoids using them. This
255 affects both Mat<zz_p> operations and the AVX-based FFT (see above).
256
227257
228258
229259
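As a rough illustration (not an NTL-provided program), the flags above end up as macros in the installed configuration header, so a client can check at compile time which of these options the library was built with; this sketch assumes the header is installed under its usual name NTL/config.h.

    #include <NTL/config.h>
    #include <iostream>

    int main()
    {
    #ifdef NTL_ENABLE_AVX_FFT
       std::cout << "built with NTL_ENABLE_AVX_FFT\n";
    #endif
    #ifdef NTL_AVOID_AVX512
       std::cout << "built with NTL_AVOID_AVX512\n";
    #endif
       return 0;
    }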
628658 NTL_FFT_BIGTAB=off
629659
630660 # Precomputed tables are used to store all the roots of unity
631 # used in FFT computations.
661 # used in FFT computations.
632662
633663
634664 NTL_FFT_LAZYMUL=off
Binary diff not shown
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_pEX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_pEX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_pEXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_pEXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_pX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_pX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_pXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_pXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_GF2.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_GF2.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_GF2E.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_GF2E.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_RR.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_RR.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_ZZ.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_ZZ.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_ZZ_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_ZZ_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_ZZ_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_ZZ_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_lzz_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_lzz_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_lzz_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_lzz_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_poly_ZZ.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_poly_ZZ.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_poly_ZZ_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_poly_ZZ_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_poly_lzz_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_poly_lzz_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/matrix.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/matrix.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/pair.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/pair.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/quad_float.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/quad_float.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/tools.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/tools.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
7171 for their collaboration and support over the years.
7272
7373
74 <li>
75 Thanks to
76 <a href="http://web.maths.unsw.edu.au/~davidharvey/">David Harvey</a>
77 for numerous improvements to NTL's FFT code. The current version
78 of NTL's FFT is derived from code originally written by David.
79
80 <li>
81 Thanks to
82 <a href="http://personales.unican.es/taberalf/">Luis Felipe Tabera Alonso</a>
83 for porting the fast
84 GCD and XGCD code to <tt>GF2EX</tt>, <tt>zz_pEX</tt>, and <tt>ZZ_pEX</tt>,
85 and for testing and tuning the code.
86
87
7488 </ul>
7589
7690
1414 A Tour of NTL: Summary of Changes
1515 </p>
1616 </h1>
17
18
19 <p><hr><p>
20 <h3>
21 2018.08.17: Changes between NTL 11.2.1 and 11.3.0
22 </h3>
23
24 <ul>
25 <li>
26 <b>Implemented an AVX-based small-prime FFT</b> (which works with
27 both AVX2 and AVX512)
28 <ul>
29 <li>
30 This can give a 2-3x speedup for the FFT.
31 <li>
32 However, it is not enabled by default, because it reduces
33 the small-prime size bound from 60 bits to 50 bits,
34 and may slow down certain computations.
35 <li>
36 This slowdown has two causes: some CRT-based computations
37 get slower because of the smaller prime size, and
38 Intel CPUs may throttle their clock speed when executing AVX
39 instructions.
40 <li>
41 To enable this feature, configure with <tt>NTL_ENABLE_AVX_FFT=on</tt>.
42 <li>
43 Here are some running times on a Skylake Xeon machine
44 (<tt>Intel(R) Xeon(R) Gold 6132 CPU @ 2.60GHz</tt>).
45 For various values of <i>n</i>,
46 we measure the time to compute <i>a<sup>e</sup>&nbsp;</i>mod<i>&nbsp;f</i>,
47 where <i>f</i> is a random monic polynomial of degree <i>n</i>
48 over <i>Z<sub>17</sub></i>,
49 <i>a</i> is a random polynomial of degree less than <i>n</i>,
50 and <i>e=2<sup>n</sup>-1</i> (a minimal reproduction sketch follows this change list).
51 <pre>
52 n 1024 2048 4096 8192 16384
53 non-AVX 0.171 0.741 3.192 14.348 60.812
54 AVX512 0.089 0.372 1.648 7.740 35.588
55 </pre>
56 </ul>
57
58 <p>
59 <li>
60 <b>Implemented AVX512 instruction sequences</b>
61 <ul>
62 <li>
63 This affects <tt>Mat&lt;zz_p&gt;</tt> arithmetic and the small-prime FFT.
64 <li>
65 Because AVX512 instructions can in certain situations
66 lead to slower computations (because of CPU "throttling"),
67 this feature can be disabled by configuring with
68 <tt>NTL_AVOID_AVX512=on</tt>.
69 </ul>
70
71 <p>
72 <li>
73 <b>Performance tuned <tt>GF2EX</tt> arithmetic</b>
74 <ul>
75 <li>
76 Tuned crossovers for various algorithms.
77 </ul>
78
79 <p>
80 <li>
81 <b>Implemented asymptotically fast GCD and XGCD for
82 <tt>GF2EX</tt>, <tt>zz_pEX</tt>, and <tt>ZZ_pEX</tt></b>
83 <ul>
84 <li>
85 Some work may still need to be done to fine tune
86 the crossovers, but they should be pretty good as is.
87 <li>
88 Many thanks to
89 <a href="http://personales.unican.es/taberalf/">Luis Felipe Tabera Alonso</a> for porting the code,
90 as well as testing and tuning it.
91 </ul>
92
93 <p>
94 <li>
95 <b>Other small changes</b>
96 <ul>
97 <li>
98 Restructured the <tt>quad_float</tt> implementation to better isolate
99 the parts that depend on correct FP rounding.
100 <li>
101 Standardized vector growth rate to 1.5 via the function <tt>_ntl_vec_grow</tt>.
102 <li>
103 Got rid of most uses of <tt>__restrict</tt> in <tt>mat_lzz_p.cpp</tt>,
104 some of which were technically UB.
105 <li>
106 Got rid of some uses of <tt>#warning</tt>, which are not portable.
107 </ul>
108
109 </ul>
110
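Minimal reproduction sketch (not the actual benchmark harness) of the operation measured in the table above: computing a^e mod f over Z_17 with e = 2^n - 1, using documented zz_pX interfaces.

    #include <NTL/lzz_pX.h>
    #include <iostream>
    using namespace NTL;

    int main()
    {
       long n = 1024;
       zz_p::init(17);               // work over Z_17

       zz_pX f;
       random(f, n);                 // random coefficients below degree n...
       SetCoeff(f, n);               // ...made monic of degree n

       zz_pX a = random_zz_pX(n);    // random a of degree < n
       ZZ e = power2_ZZ(n) - 1;      // e = 2^n - 1

       zz_pXModulus F(f);
       zz_pX x;
       double t0 = GetTime();
       PowerMod(x, a, e, F);         // the operation being timed
       std::cout << (GetTime() - t0) << "\n";
       return 0;
    }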
111 <p><hr><p>
112 <h3>
113 2018.07.15: Changes between NTL 11.2.0 and 11.2.1
114 </h3>
115
116 <ul>
117 <li>
118 Fixed an embarrassing bug, introduced in NTL 11.2.0,
119 in which <tt>add(ZZ,ZZ,long)</tt>
120 and <tt>sub(ZZ,ZZ,long)</tt> would give an incorrect result
121 if the third argument was zero.
122 <li>
123 Fixed incorrect libtool version number in NTL 11.2.0.
124 </ul>
125
126 <p><hr><p>
127 <h3>
128 2018.07.07: Changes between NTL 11.1.0 and 11.2.0
129 </h3>
130
131 <ul>
132 <li>
133 <b>Complete re-write of the
134 Schoenhage-Strassen FFT for <tt>ZZX</tt> arithmetic.</b>
135
136 <ul>
137 <li>
138 Implementation of "truncated" FFT
139 <li>
140 Implementation of the "sqrt 2" trick
141 <li>
142 More efficient implementation of low-level butterfly operations
143 <li>
144 Here is some timing data comparing <tt>ZZX</tt> multiplication times
145 in NTL 11.0 and 11.2.
146 The entries are the ratio of the 11.0 time to the 11.2 time (so
147 the bigger the number, the bigger the improvement).
148 The rows are labeled by the bit-length <i>k</i> of the coefficients,
149 the columns by the degree bound <i>n</i>.
150 Unlabeled columns represent degree bounds halfway between the labeled
151 ones. (A minimal <tt>ZZX</tt> multiplication sketch follows this change list.)
152
153
154 <p>
155
156 <img src="zmulrat.jpg" border="0" style="display: block; width: 60%; height: auto;">
157
158 <p>
159 <li>
160 For multiplication in <tt>ZZX</tt>, NTL and
161 <a href="http://www.flintlib.org">FLINT</a>
162 now have comparable performance across a wide range
163 of parameter sizes, with NTL being 2x faster for some parameters,
164 and FLINT being 1.5x faster for others.
165 Here is a chart showing the ratio of FLINT time
166 over NTL time (so the bigger the number, the faster NTL is relative to FLINT).
167 <p>
168
169 <img src="flintrat.jpg" border="0" style="display: block; width: 60%; height: auto;">
170
171 <p>
172 <li>
173 See also this report on
174 <a href="http://www.shoup.net/ntl/benchmarks.pdf">NTL vs FLINT</a>
175 for detailed benchmarks that compare the performance of NTL and FLINT
176 on a number of operations and parameter settings.
177
178 <li>
179 Future plans for NTL's Schoenhage-Strassen code:
180 <ul>
181 <li>
182 Implement something like Bailey's 4-step variant
183 (which should yield better cache behavior)
184 <li>
185 Thread boosting (built on top of the 4-step variant)
186 </ul>
187 </ul>
188
189 <li>
190 Some fine tuning of the new small-prime
191 truncated-FFT implementation introduced in version 11.0.
192 <li>
193 Fixed an obscure bug in the new small-prime FFT code: this only affects
194 users who call low-level, undocumented FFT routines
195 on transforms of size 2, so it is unlikely to have affected
196 any real code.
197
198 <li>
199 Performance improvements to <tt>ZZ+long</tt> and <tt>ZZ-long</tt>
200 routines (and by extension <tt>ZZ+=long</tt>, <tt>ZZ-=long</tt>,
201 <tt>ZZ++</tt>, and <tt>ZZ--</tt>)
202
203
204
205 </ul>
206
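Here is the minimal ZZX multiplication sketch referred to above (not a benchmarking harness): multiplying two random polynomials of degree less than n with k-bit coefficients, which for parameters of this size exercises the rewritten Schoenhage-Strassen code.

    #include <NTL/ZZX.h>
    #include <iostream>
    using namespace NTL;

    int main()
    {
       long n = 4096;                // degree bound
       long k = 1024;                // coefficient bit-length

       ZZX a, b, c;
       for (long i = 0; i < n; i++) {
          SetCoeff(a, i, RandomBits_ZZ(k));
          SetCoeff(b, i, RandomBits_ZZ(k));
       }

       double t0 = GetTime();
       mul(c, a, b);                 // ZZX multiplication
       std::cout << "mul time: " << (GetTime() - t0) << "\n";
       return 0;
    }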
207 <p><hr><p>
208 <h3>
209 2018.06.07: Changes between NTL 11.0.0 and 11.1.0
210 </h3>
211
212 <ul>
213 <li>
214 <b>Complete re-write of the low-level "small-prime" FFT (a.k.a., NTT).</b>
215
216 <ul>
217 <li> This implements a "truncated" FFT, which can speed up
218 polynomial multiplication by a factor of two, and which
219 mainly eliminates "jumps" in the running time at powers of two.
220 The new FFT routines are in fact a bit faster even at powers of two.
221
222 <li> Some low-level interfaces have changed, but these are
223 all <i>undocumented</i>, so should not cause
224 any problems for clients that don't inappropriately
225 use such interfaces.
226
227
228 <li>
229 Here is some timing data comparing the new (truncated) FFT to
230 the old (plain) FFT. <i>x</i>-axis is degree bound,
231 <i>y</i>-axis is time (in seconds), shown on a log/log scale.
232 This is the time to multiply two polynomials modulo
233 a single-precision "FFT" prime (60 bits).
234
235 <p>
236
237 <img src="TFT-time.jpg" border="0" style="display: block; width: 40%; height: auto;">
238
239 </ul>
240
241 <p>
242 <li>
243 <b>Improved performance of ZZ mul and sqr on small inputs</b>
244 <ul>
245 <li>mul speedup: 1 limb: 2.5x; 2 limbs: 1.4x; 3 limbs: 1.3x.
246 <li>NTL now makes explicit calls to <tt>mpn_sqr</tt> and
247 requires GMP version 5.0 or later.
248 </ul>
249
250 <p>
251 <li>
252 <b>Other changes:</b>
253 <ul>
254 <li>
255 Changed header files to make Windows installation more reliable,
256 especially for IDEs like Code Blocks
257
258 <li>
259 Added documentation for the <tt>GCD</tt> routine in the <tt>ZZ</tt> module (a short usage sketch follows this change list)
260
261 <li>
262 Fixed a bit of UB in the <tt>lip.h</tt> interface (<tt>_ntl_gbigint_body</tt>)
263 </ul>
264
265 </ul>
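Short usage sketch (not part of the NTL documentation) of the ZZ routines mentioned above: multiplying two random 1000-bit integers and computing their GCD with the now-documented GCD routine.

    #include <NTL/ZZ.h>
    #include <iostream>
    using namespace NTL;

    int main()
    {
       ZZ a, b;
       RandomBits(a, 1000);          // random 1000-bit integers
       RandomBits(b, 1000);

       ZZ c, d;
       mul(c, a, b);                 // c = a*b
       GCD(d, a, b);                 // d = gcd(a, b)

       std::cout << NumBits(c) << " " << NumBits(d) << "\n";
       return 0;
    }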
17266
18267
19268 <p><hr><p>
669918
670919
671920 </ul>
672
921 </ul>
673922
674923
675924 <p><hr><p>
3434 <p>
3535 <pre>
3636
37 multiply 1000-bit ints: 1.7641e-07
38 square 1000-bit ints: 1.20344e-07
39 remainder 2000/1000-bit ints: 3.59872e-07
40 gcd 1000-bit ints: 2.83256e-06
41 xgcd 1000-bit ints: 4.32945e-06
42 power mod 1000-bit ints: 0.000441862
43 multiply degree-1000 poly mod 1000-bit prime: 0.00433029
44 remainder degree-2000/1000 poly mod 1000-bit prime: 0.0125181
45 preconditioned remainder degree-2000/1000 poly mod 1000-bit prime: 0.00441719
46 gcd degree-1000 poly mod 1000-bit prime: 0.123718
47 multiply degree-1000 int poly with 1000-bit coeffs: 0.00612337
37 multiply 1000-bit ints: 1.77903e-07
38 square 1000-bit ints: 1.08537e-07
39 remainder 2000/1000-bit ints: 3.58799e-07
40 gcd 1000-bit ints: 2.86069e-06
41 xgcd 1000-bit ints: 4.27161e-06
42 power mod 1000-bit ints: 0.000424325
43 multiply degree-1000 poly mod 1000-bit prime: 0.0041019
44 remainder degree-2000/1000 poly mod 1000-bit prime: 0.0119166
45 preconditioned remainder degree-2000/1000 poly mod 1000-bit prime: 0.00418589
46 gcd degree-1000 poly mod 1000-bit prime: 0.122145
47 multiply degree-1000 int poly with 1000-bit coeffs: 0.00467749
4848
4949 factoring degree-1000 poly mod 1000-bit prime...
50 square-free decomposition...0.123419
50 square-free decomposition...0.119126
5151 factoring multiplicity 1, deg = 1000
52 computing X^p...7.28103
53 computing DDF...generating baby steps...+++++++++++++++++++++2.78938
54 generating giant steps...++++++++++++++++++++++2.89548
52 computing X^p...6.89619
53 computing DDF...generating baby steps...+++++++++++++++++++++2.72505
54 generating giant steps...++++++++++++++++++++++2.82554
5555 giant refine...++++split 1 18
5656 *++++*++++*++++*++++split 17 355
5757 *split 0 627
58 giant refine time: 4.31472
58 giant refine time: 4.09811
5959 baby refine...split 1 1
6060 split 8 8
6161 split 9 9
6262 split 355 355
6363 split 627 627
64 baby refine time: 0.03662
65 DDF time: 10.0396
66 ...total time = 17.4524
64 baby refine time: 0.037111
65 DDF time: 9.6903
66 ...total time = 16.7138
6767
68 multiply 500-bit GF2Xs: 5.50411e-08
69 remainder 1000/500-bit GF2Xs: 8.22747e-07
70 gcd 500-bit GF2Xs: 3.52091e-06
68 multiply 500-bit GF2Xs: 5.3414e-08
69 remainder 1000/500-bit GF2Xs: 8.19842e-07
70 gcd 500-bit GF2Xs: 3.57209e-06
7171
72 factoring degree-500 GF2X: 0.000148627
73 gcd 500-bit GF2X: 3.54025e-06
74 multiply degree-500 poly mod 500-bit GF2X: 0.00247895
75 remainder degree-1000/500 poly mod 500-bit GF2X: 0.00889676
76 preconditioned remainder degree-1000/500 poly mod 500-bit GF2X: 0.00500091
77 gcd degree-500 poly mod 500-bit GF2X: 0.0453614
72 factoring degree-500 GF2X: 0.000154251
73 gcd 500-bit GF2X: 3.55401e-06
74 multiply degree-500 poly mod 500-bit GF2X: 0.00247313
75 remainder degree-1000/500 poly mod 500-bit GF2X: 0.00889548
76 preconditioned remainder degree-1000/500 poly mod 500-bit GF2X: 0.00498747
77 gcd degree-500 poly mod 500-bit GF2X: 0.0451361
7878
7979 factoring degree-500 poly mod 500-bit GF2X...
80 square-free decomposition...0.004363
80 square-free decomposition...0.004369
8181 factoring multiplicity 1, deg = 250
82 computing X^p...0.476299
83 computing DDF...generating baby steps...++++++++++0.32838
84 generating giant steps...+++++++++++0.351889
82 computing X^p...0.478202
83 computing DDF...generating baby steps...++++++++++0.329912
84 generating giant steps...+++++++++++0.355037
8585 giant refine...++++split 1 9
8686 split 2 13
8787 split 4 44
8888 *++++split 7 73
8989 *split 0 111
90 giant refine time: 0.229197
90 giant refine time: 0.230542
9191 baby refine...split 9 9
9292 split 13 13
9393 split 44 44
9494 split 73 73
9595 split 111 111
96 baby refine time: 0.001222
97 DDF time: 0.910726
96 baby refine time: 0.001228
97 DDF time: 0.916753
9898
99 ...total time = 1.39327
99 ...total time = 1.39667
100100
101101 </pre>
102102
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_GF2.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_GF2.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_GF2E.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_GF2E.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_RR.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_RR.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_ZZ.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_ZZ.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_ZZ_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_ZZ_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_ZZ_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_ZZ_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_lzz_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_lzz_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_lzz_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_lzz_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vector.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vector.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/version.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/version.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/xdouble.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/xdouble.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
Binary diff not shown
66 #include <NTL/HAVE_PCLMUL.h>
77 #include <NTL/HAVE_AVX2.h>
88 #include <NTL/HAVE_FMA.h>
9 #include <NTL/HAVE_AVX512F.h>
910 #include <NTL/HAVE_COPY_TRAITS1.h>
1011 #include <NTL/HAVE_COPY_TRAITS2.h>
1112 #include <NTL/HAVE_CHRONO_TIME.h>
88 #include <NTL/LazyTable.h>
99
1010 NTL_OPEN_NNS
11
12 #define NTL_PROVIDES_TRUNC_FFT
1113
1214 #define NTL_FFTFudge (4)
1315 // This constant is used in selecting the correct
3537
3638
3739
38 class FFTVectorPair {
39 public:
40 Vec<long> wtab_precomp;
41 Vec<mulmod_precon_t> wqinvtab_precomp;
40 // PIMPL pattern: FFTMulTabs defined in FFT.cpp
41 class FFTMulTabs;
42 struct FFTMulTabsDeleterPolicy {
43 static void deleter(FFTMulTabs *p);
4244 };
4345
44 typedef LazyTable<FFTVectorPair, NTL_FFTMaxRoot+1> FFTMultipliers;
45
46
47 class FFTMulTabs {
48 public:
49
50 FFTMultipliers MulTab[2];
51
52 };
5346
5447 class zz_pInfoT; // forward reference, defined in lzz_p.h
5548
7669 Vec<mulmod_precon_t> TwoInvPreconTable;
7770 // mulmod preconditioning data
7871
79 UniquePtr< FFTMulTabs > bigtab;
72 UniquePtr< FFTMulTabs, FFTMulTabsDeleterPolicy > bigtab;
8073
8174 };
8275
83 void InitFFTPrimeInfo(FFTPrimeInfo& info, long q, long w, bool bigtab);
76 void InitFFTPrimeInfo(FFTPrimeInfo& info, long q, long w, long bigtab_index);
8477
8578
8679 #define NTL_MAX_FFTPRIMES (20000)
133126 // allocates and initializes information for FFT prime
134127
135128
136 void FFT(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir);
137 // the low-level FFT routine.
138 // computes a 2^k point FFT modulo q = info.q
139 // dir == 0 => forward direction (using roots)
140 // dir == 1 => backwards direction (using inverse roots)
141
142
143
129 void new_fft(long* A, const long* a, long k,
130 const FFTPrimeInfo& info, long yn, long xn);
131
132 inline
133 void new_fft(long* A, const long* a, long k,
134 const FFTPrimeInfo& info)
135 { new_fft(A, a, k, info, 1L << k, 1L << k); }
136
137
138 void new_ifft(long* A, const long* a, long k,
139 const FFTPrimeInfo& info, long yn);
140
141 inline
142 void new_ifft(long* A, const long* a, long k,
143 const FFTPrimeInfo& info)
144 { new_ifft(A, a, k, info, 1L << k); }
145
146
147 void new_fft_flipped(long* A, const long* a, long k,
148 const FFTPrimeInfo& info);
149
150 void new_ifft_flipped(long* A, const long* a, long k,
151 const FFTPrimeInfo& info);
144152
145153
146154 inline
147155 void FFTFwd(long* A, const long *a, long k, const FFTPrimeInfo& info)
148 // Slightly higher level interface...using the ith FFT prime
149 {
150 FFT(A, a, k, info, 0);
151 }
152
156 {
157 new_fft(A, a, k, info);
158 }
159
160 inline
161 void FFTFwd_trunc(long* A, const long *a, long k, const FFTPrimeInfo& info,
162 long yn, long xn)
163 {
164 new_fft(A, a, k, info, yn, xn);
165 }
166
167 inline
168 void FFTFwd_trans(long* A, const long *a, long k, const FFTPrimeInfo& info)
169 {
170 new_ifft_flipped(A, a, k, info);
171 }
153172
154173 inline
155174 void FFTFwd(long* A, const long *a, long k, long i)
175 // Slightly higher level interface...using the ith FFT prime
156176 {
157177 FFTFwd(A, a, k, *FFTTables[i]);
158178 }
159179
160180 inline
161 void FFTRev(long* A, const long *a, long k, const FFTPrimeInfo& info)
162 // Slightly higher level interface...using the ith FFT prime
163 {
164 FFT(A, a, k, info, 1);
165 }
166
167 inline
168 void FFTRev(long* A, const long *a, long k, long i)
169 {
170 FFTRev(A, a, k, *FFTTables[i]);
171 }
172
173 inline
174 void FFTMulTwoInv(long* A, const long *a, long k, const FFTPrimeInfo& info)
175 {
176 VectorMulModPrecon(1L << k, A, a, info.TwoInvTable[k], info.q,
177 info.TwoInvPreconTable[k]);
178 }
179
180 inline
181 void FFTMulTwoInv(long* A, const long *a, long k, long i)
182 {
183 FFTMulTwoInv(A, a, k, *FFTTables[i]);
184 }
185
186 inline
181 void FFTFwd_trunc(long* A, const long *a, long k, long i, long yn, long xn)
182 // Slightly higher level interface...using the ith FFT prime
183 {
184 FFTFwd_trunc(A, a, k, *FFTTables[i], yn, xn);
185 }
186
187 inline
188 void FFTFwd_trans(long* A, const long *a, long k, long i)
189 // Slightly higher level interface...using the ith FFT prime
190 {
191 FFTFwd_trans(A, a, k, *FFTTables[i]);
192 }
193
194
195
196
197 inline
187198 void FFTRev1(long* A, const long *a, long k, const FFTPrimeInfo& info)
188 // FFTRev + FFTMulTwoInv
189 {
190 FFTRev(A, a, k, info);
191 FFTMulTwoInv(A, A, k, info);
192 }
193
194 inline
199 {
200 new_ifft(A, a, k, info);
201 }
202
203 inline
204 void FFTRev1_trunc(long* A, const long *a, long k, const FFTPrimeInfo& info,
205 long yn)
206 {
207 new_ifft(A, a, k, info, yn);
208 }
209
210 inline
211 void FFTRev1_trans(long* A, const long *a, long k, const FFTPrimeInfo& info)
212 {
213 new_fft_flipped(A, a, k, info);
214 }
215
216 inline
195217 void FFTRev1(long* A, const long *a, long k, long i)
218 // Slightly higher level interface...using the ith FFT prime
196219 {
197220 FFTRev1(A, a, k, *FFTTables[i]);
198221 }
222
223 inline
224 void FFTRev1_trunc(long* A, const long *a, long k, long i, long yn)
225 // Slightly higher level interface...using the ith FFT prime
226 {
227 FFTRev1_trunc(A, a, k, *FFTTables[i], yn);
228 }
229
230 inline
231 void FFTRev1_trans(long* A, const long *a, long k, long i)
232 // Slightly higher level interface...using the ith FFT prime
233 {
234 FFTRev1_trans(A, a, k, *FFTTables[i]);
235 }
236
199237
200238
201239 long IsFFTPrime(long n, long& w);
0
1 #ifndef NTL_FFT_impl__H
2 #define NTL_FFT_impl__H
3
4 #include <NTL/tools.h>
5
6 NTL_OPEN_NNS
7
8 #ifdef NTL_ENABLE_AVX_FFT
9
10 #if (!defined(NTL_HAVE_AVX512F) && !(defined(NTL_HAVE_AVX2) && defined(NTL_HAVE_FMA)))
11 #error "NTL_ENABLE_AVX_FFT: not supported on this platform"
12 #endif
13
14 #if (defined(NTL_HAVE_AVX512F) && !defined(NTL_AVOID_AVX512))
15 #define NTL_LG2_PDSZ (3)
16 #else
17 #define NTL_LG2_PDSZ (2)
18 #endif
19
20 #define NTL_FFT_RDUP (NTL_LG2_PDSZ+3)
21 #define NTL_PDSZ (1 << NTL_LG2_PDSZ)
22
23 #else
24
25 #define NTL_FFT_RDUP (4)
26 // Currently, this should be at least 2 to support
27 // loop unrolling in the FFT implementation
28
29 #endif
30
31 inline
32 long FFTRoundUp(long xn, long k)
33 {
34 long n = 1L << k;
35 if (xn <= 0) return n;
36 // default truncation value of 0 gets converted to n
37
38 xn = ((xn+((1L << NTL_FFT_RDUP)-1)) >> NTL_FFT_RDUP) << NTL_FFT_RDUP;
39
40 if (k >= 10) {
41 if (xn > n - (n >> 4)) xn = n;
42 }
43 else {
44 if (xn > n - (n >> 3)) xn = n;
45 }
46 // truncation just a bit below n does not really help
47 // at all, and can sometimes slow things down slightly, so round up
48 // to n. This also takes care of cases where xn > n.
49 // Actually, for smallish n, we should round up sooner,
50 // at n-n/8, and for larger n, we should round up later,
51 // at n-n/16. At least, experimentally, this is what I see.
52
53 return xn;
54 }
55
56
57 NTL_CLOSE_NNS
58
59 #endif
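To illustrate FFTRoundUp, here are a few hand-worked values (a sketch, assuming the non-AVX setting NTL_FFT_RDUP = 4, i.e. truncation lengths rounded to multiples of 16):

    // FFTRoundUp( 500, 10): 500 rounds up to 512;  512 <= 1024 - 1024/16 = 960, so return 512
    // FFTRoundUp(1000, 10): 1000 rounds up to 1008; 1008 > 960, so round all the way up to 1024
    // FFTRoundUp(   0, 10): xn <= 0 means "untruncated", so return n = 1024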
2525 long KarCross;
2626 long ModCross;
2727 long DivCross;
28 long GCDCross;
2829
2930 long _card_exp;
3031 Lazy<ZZ> _card;
140141 static long KarCross() { return GF2EInfo->KarCross; }
141142 static long ModCross() { return GF2EInfo->ModCross; }
142143 static long DivCross() { return GF2EInfo->DivCross; }
144 static long GCDCross() { return GF2EInfo->GCDCross; }
143145
144146 static long degree() { return GF2EInfo->p.n; }
145147
0 #ifndef NTL_PD__H
1 #define NTL_PD__H
2
3 #include <NTL/tools.h>
4 #include <immintrin.h>
5
6 NTL_OPEN_NNS
7
8
9 template<int N>
10 struct PD {
11 private:
12 PD();
13 };
14
15
16 // FIXME: should distinguish more carefully:
17 // AVX512DQ for long/double conversions
18 // AVX512VL for certain ops applied to shorter types:
19 // long/double conversions and mask ops
20 // may need to translate long/double conversions for non-AVXDQ512
21
22
23
24 //=================== PD<8> implementation ===============
25
26 #ifdef NTL_HAVE_AVX512F
27
28 template<>
29 struct PD<8> {
30 __m512d data;
31
32 enum { size = 8};
33
34 PD() { }
35 PD(double x) : data(_mm512_set1_pd(x)) { }
36 PD(__m512d _data) : data(_data) { }
37
38 PD(double d0, double d1, double d2, double d3,
39 double d4, double d5, double d6, double d7)
40 : data(_mm512_set_pd(d7, d6, d5, d4, d3, d2, d1, d0)) { }
41
42 static PD load(const double *p) { return _mm512_load_pd(p); }
43
44 // load from unaligned address
45 static PD loadu(const double *p) { return _mm512_loadu_pd(p); }
46 };
47
48 inline void
49 load(PD<8>& x, const double *p)
50 { x = PD<8>::load(p); }
51
52 // load from unaligned address
53 inline void
54 loadu(PD<8>& x, const double *p)
55 { x = PD<8>::loadu(p); }
56
57 inline void
58 store(double *p, PD<8> a)
59 { _mm512_store_pd(p, a.data); }
60
61 // store to unaligned address
62 inline void
63 storeu(double *p, PD<8> a)
64 { _mm512_storeu_pd(p, a.data); }
65
66 // load and convert
67 inline void
68 load(PD<8>& x, const long *p)
69 { __m512i a = _mm512_load_epi64(p); x = _mm512_cvtepi64_pd(a); }
70
71 // load unaligned and convert
72 inline void
73 loadu(PD<8>& x, const long *p)
74 { __m512i a = _mm512_loadu_si512(p); x = _mm512_cvtepi64_pd(a); }
75
76 // convert and store
77 inline void
78 store(long *p, PD<8> a)
79 { __m512i b = _mm512_cvtpd_epi64(a.data); _mm512_store_epi64(p, b); }
80
81 // convert and store unaligned
82 inline void
83 storeu(long *p, PD<8> a)
84 { __m512i b = _mm512_cvtpd_epi64(a.data); _mm512_storeu_si512(p, b); }
85
86
87 // swap even/odd slots
88 // e.g., 01234567 -> 10325476
89 inline PD<8>
90 swap2(PD<8> a)
91 { return _mm512_permute_pd(a.data, 0x55); }
92
93 // swap even/odd slot-pairs
94 // e.g., 01234567 -> 23016745
95 inline PD<8>
96 swap4(PD<8> a)
97 { return _mm512_permutex_pd(a.data, 0x4e); }
98
99 // 01234567 -> 00224466
100 inline PD<8>
101 dup2even(PD<8> a)
102 { return _mm512_permute_pd(a.data, 0); }
103
104 // 01234567 -> 11335577
105 inline PD<8>
106 dup2odd(PD<8> a)
107 { return _mm512_permute_pd(a.data, 0xff); }
108
109 // 01234567 -> 01014545
110 inline PD<8>
111 dup4even(PD<8> a)
112 { return _mm512_permutex_pd(a.data, 0x44); }
113
114 // 01234567 -> 23236767
115 inline PD<8>
116 dup4odd(PD<8> a)
117 { return _mm512_permutex_pd(a.data, 0xee); }
118
119 // blend even/odd slots
120 // 01234567, 89abcdef -> 092b4d6f
121 inline PD<8>
122 blend2(PD<8> a, PD<8> b)
123 { return _mm512_mask_blend_pd(0xaa, a.data, b.data); }
124 // FIXME: why isn't there an intrinsic that doesn't require a mask register?
125
126 // blend even/odd slot-pairs
127 // 01234567, 89abcdef -> 01ab45ef
128 inline PD<8>
129 blend4(PD<8> a, PD<8> b)
130 { return _mm512_mask_blend_pd(0xcc, a.data, b.data); }
131 // FIXME: why isn't there an intrinsic that doesn't require a mask register?
132
133 // res[i] = a[i] < b[i] ? a[i] : a[i]-b[i]
134 inline PD<8>
135 correct_excess(PD<8> a, PD<8> b)
136 {
137 __mmask8 k = _mm512_cmp_pd_mask(a.data, b.data, _CMP_GE_OQ);
138 return _mm512_mask_sub_pd(a.data, k, a.data, b.data);
139 }
140
141 // res[i] = a[i] >= 0 ? a[i] : a[i]+b[i]
142 inline PD<8>
143 correct_deficit(PD<8> a, PD<8> b)
144 {
145 __mmask8 k = _mm512_cmp_pd_mask(a.data, _mm512_setzero_pd(), _CMP_LT_OQ);
146 return _mm512_mask_add_pd(a.data, k, a.data, b.data);
147 }
148
149 inline void
150 clear(PD<8>& x)
151 { x.data = _mm512_setzero_pd(); }
152
153 inline PD<8>
154 operator+(PD<8> a, PD<8> b)
155 { return _mm512_add_pd(a.data, b.data); }
156
157 inline PD<8>
158 operator-(PD<8> a, PD<8> b)
159 { return _mm512_sub_pd(a.data, b.data); }
160
161 inline PD<8>
162 operator*(PD<8> a, PD<8> b)
163 { return _mm512_mul_pd(a.data, b.data); }
164
165 inline PD<8>
166 operator/(PD<8> a, PD<8> b)
167 { return _mm512_div_pd(a.data, b.data); }
168
169 inline PD<8>&
170 operator+=(PD<8>& a, PD<8> b)
171 { a = a + b; return a; }
172
173 inline PD<8>&
174 operator-=(PD<8>& a, PD<8> b)
175 { a = a - b; return a; }
176
177 inline PD<8>&
178 operator*=(PD<8>& a, PD<8> b)
179 { a = a * b; return a; }
180
181 inline PD<8>&
182 operator/=(PD<8>& a, PD<8> b)
183 { a = a / b; return a; }
184
185 // a*b+c (fused)
186 inline PD<8>
187 fused_muladd(PD<8> a, PD<8> b, PD<8> c)
188 { return _mm512_fmadd_pd(a.data, b.data, c.data); }
189
190 // a*b-c (fused)
191 inline PD<8>
192 fused_mulsub(PD<8> a, PD<8> b, PD<8> c)
193 { return _mm512_fmsub_pd(a.data, b.data, c.data); }
194
195 // -a*b+c (fused)
196 inline PD<8>
197 fused_negmuladd(PD<8> a, PD<8> b, PD<8> c)
198 { return _mm512_fnmadd_pd(a.data, b.data, c.data); }
199
200 #endif
201
202 //=================== PD<4> implementation ===============
203
204 #if (defined(NTL_HAVE_AVX2) && defined(NTL_HAVE_FMA))
205
206 template<>
207 struct PD<4> {
208 __m256d data;
209
210 enum { size = 4};
211
212 PD() { }
213 PD(double x) : data(_mm256_set1_pd(x)) { }
214 PD(__m256d _data) : data(_data) { }
215 PD(double d0, double d1, double d2, double d3)
216 : data(_mm256_set_pd(d3, d2, d1, d0)) { }
217
218 static PD load(const double *p) { return _mm256_load_pd(p); }
219
220 // load from unaligned address
221 static PD loadu(const double *p) { return _mm256_loadu_pd(p); }
222 };
223
224 inline void
225 load(PD<4>& x, const double *p)
226 { x = PD<4>::load(p); }
227
228 // load from unaligned address
229 inline void
230 loadu(PD<4>& x, const double *p)
231 { x = PD<4>::loadu(p); }
232
233 inline void
234 store(double *p, PD<4> a)
235 { _mm256_store_pd(p, a.data); }
236
237 // store to unaligned address
238 inline void
239 storeu(double *p, PD<4> a)
240 { _mm256_storeu_pd(p, a.data); }
241
242
243
244
245
246 // The following assume all numbers are integers
247 // in the range [0, 2^52). The idea is taken from here:
248 // https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
249
250
251 // Some of the Intel intrinsics for loading and storing packed
252 // integers from memory require casting between long* and __m256i*.
253 // Strictly speaking, this can break strict aliasing rules, but
254 // this is hopefully not a problem.
255 // See discussion here:
256 // https://stackoverflow.com/questions/24787268/how-to-implement-mm-storeu-epi64-without-aliasing-problems
257
258
259 // load and convert
260 inline void
261 load(PD<4>& x, const long *p)
262 {
263 #ifdef NTL_HAVE_AVX512F
264 __m256i a = _mm256_load_si256((const __m256i*)p);
265 x = _mm256_cvtepi64_pd(a);
266 #else
267 __m256i a = _mm256_load_si256((const __m256i*)p);
268 a = _mm256_or_si256(a, _mm256_castpd_si256(_mm256_set1_pd(1L << 52)));
269 x = _mm256_sub_pd(_mm256_castsi256_pd(a), _mm256_set1_pd(1L << 52));
270 #endif
271 }
272
273 // load unaligned and convert
274 inline void
275 loadu(PD<4>& x, const long *p)
276 {
277 #ifdef NTL_HAVE_AVX512F
278 __m256i a = _mm256_loadu_si256((const __m256i*)p); x = _mm256_cvtepi64_pd(a);
279 #else
280 __m256i a = _mm256_loadu_si256((const __m256i*)p);
281 a = _mm256_or_si256(a, _mm256_castpd_si256(_mm256_set1_pd(1L << 52)));
282 x = _mm256_sub_pd(_mm256_castsi256_pd(a), _mm256_set1_pd(1L << 52));
283 #endif
284 }
285
286 // convert and store
287 inline void
288 store(long *p, PD<4> a)
289 {
290 #ifdef NTL_HAVE_AVX512F
291 __m256i b = _mm256_cvtpd_epi64(a.data);
292 #ifdef __clang__
293 _mm256_store_si256((__m256i*)p, b);
294 #else
295 // clang doesn't define this...why??
296 _mm256_store_epi64(p, b);
297 #endif
298 #else
299 __m256d x = a.data;
300 x = _mm256_add_pd(x, _mm256_set1_pd(1L << 52));
301 __m256i b = _mm256_xor_si256(
302 _mm256_castpd_si256(x),
303 _mm256_castpd_si256(_mm256_set1_pd(1L << 52)));
304 _mm256_store_si256((__m256i*)p, b);
305 #endif
306 }
307
308 // convert and store unaligned
309 inline void
310 storeu(long *p, PD<4> a)
311 {
312 #ifdef NTL_HAVE_AVX512F
313 __m256i b = _mm256_cvtpd_epi64(a.data);
314 _mm256_storeu_si256((__m256i*)p, b);
315 #else
316 __m256d x = a.data;
317 x = _mm256_add_pd(x, _mm256_set1_pd(1L << 52));
318 __m256i b = _mm256_xor_si256(
319 _mm256_castpd_si256(x),
320 _mm256_castpd_si256(_mm256_set1_pd(1L << 52)));
321 _mm256_storeu_si256((__m256i*)p, b);
322 #endif
323 }
324
325
326 // swap even/odd slots
327 // e.g., 0123 -> 1032
328 inline PD<4>
329 swap2(PD<4> a)
330 { return _mm256_permute_pd(a.data, 0x5); }
331
332 // 0123 -> 0022
333 inline PD<4>
334 dup2even(PD<4> a)
335 { return _mm256_permute_pd(a.data, 0); }
336
337 // 0123 -> 1133
338 inline PD<4>
339 dup2odd(PD<4> a)
340 { return _mm256_permute_pd(a.data, 0xf); }
341
342 // blend even/odd slots
343 // 0123, 4567 -> 0527
344 inline PD<4>
345 blend2(PD<4> a, PD<4> b)
346 { return _mm256_blend_pd(a.data, b.data, 0xa); }
347
348 // res[i] = a[i] < b[i] ? a[i] : a[i]-b[i]
349 inline PD<4>
350 correct_excess(PD<4> a, PD<4> b)
351 {
352 #ifdef NTL_HAVE_AVX512F
353 __mmask8 k = _mm256_cmp_pd_mask(a.data, b.data, _CMP_GE_OQ);
354 return _mm256_mask_sub_pd(a.data, k, a.data, b.data);
355 #else
356 __m256d mask = _mm256_cmp_pd(a.data, b.data, _CMP_GE_OQ);
357 __m256d corrected = _mm256_sub_pd(a.data, b.data);
358 return _mm256_blendv_pd(a.data, corrected, mask);
359 #endif
360 }
361
362 // res[i] = a[i] >= 0 ? a[i] : a[i]+b[i]
363 inline PD<4>
364 correct_deficit(PD<4> a, PD<4> b)
365 {
366 #ifdef NTL_HAVE_AVX512F
367 __mmask8 k = _mm256_cmp_pd_mask(a.data, _mm256_setzero_pd(), _CMP_LT_OQ);
368 return _mm256_mask_add_pd(a.data, k, a.data, b.data);
369 #else
370 __m256d mask = _mm256_cmp_pd(a.data, _mm256_setzero_pd(), _CMP_LT_OQ);
371 __m256d corrected = _mm256_add_pd(a.data, b.data);
372 return _mm256_blendv_pd(a.data, corrected, mask);
373 #endif
374 }
375
376 inline void
377 clear(PD<4>& x)
378 { x.data = _mm256_setzero_pd(); }
379
380 inline PD<4>
381 operator+(PD<4> a, PD<4> b)
382 { return _mm256_add_pd(a.data, b.data); }
383
384 inline PD<4>
385 operator-(PD<4> a, PD<4> b)
386 { return _mm256_sub_pd(a.data, b.data); }
387
388 inline PD<4>
389 operator*(PD<4> a, PD<4> b)
390 { return _mm256_mul_pd(a.data, b.data); }
391
392 inline PD<4>
393 operator/(PD<4> a, PD<4> b)
394 { return _mm256_div_pd(a.data, b.data); }
395
396 inline PD<4>&
397 operator+=(PD<4>& a, PD<4> b)
398 { a = a + b; return a; }
399
400 inline PD<4>&
401 operator-=(PD<4>& a, PD<4> b)
402 { a = a - b; return a; }
403
404 inline PD<4>&
405 operator*=(PD<4>& a, PD<4> b)
406 { a = a * b; return a; }
407
408 inline PD<4>&
409 operator/=(PD<4>& a, PD<4> b)
410 { a = a / b; return a; }
411
412 // a*b+c (fused)
413 inline PD<4>
414 fused_muladd(PD<4> a, PD<4> b, PD<4> c)
415 { return _mm256_fmadd_pd(a.data, b.data, c.data); }
416
417 // a*b-c (fused)
418 inline PD<4>
419 fused_mulsub(PD<4> a, PD<4> b, PD<4> c)
420 { return _mm256_fmsub_pd(a.data, b.data, c.data); }
421
422 // -a*b+c (fused)
423 inline PD<4>
424 fused_negmuladd(PD<4> a, PD<4> b, PD<4> c)
425 { return _mm256_fnmadd_pd(a.data, b.data, c.data); }
426
427
428 //=================== PD<2> implementation ===============
429
430
431 template<>
432 struct PD<2> {
433 __m128d data;
434
435 enum { size = 2};
436
437 PD() { }
438 PD(double x) : data(_mm_set1_pd(x)) { }
439 PD(__m128d _data) : data(_data) { }
440 PD(double d0, double d1)
441 : data(_mm_set_pd(d1, d0)) { }
442
443 static PD load(const double *p) { return _mm_load_pd(p); }
444
445 // load from unaligned address
446 static PD loadu(const double *p) { return _mm_loadu_pd(p); }
447 };
448
449 inline void
450 load(PD<2>& x, const double *p)
451 { x = PD<2>::load(p); }
452
453 // load from unaligned address
454 inline void
455 loadu(PD<2>& x, const double *p)
456 { x = PD<2>::loadu(p); }
457
458 inline void
459 store(double *p, PD<2> a)
460 { _mm_store_pd(p, a.data); }
461
462 // store to unaligned address
463 inline void
464 storeu(double *p, PD<2> a)
465 { _mm_storeu_pd(p, a.data); }
466
467
468
469
470
471 // The following assume all numbers are integers
472 // in the range [0, 2^52). The idea is taken from here:
473 // https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
474
475 // load and convert
476 inline void
477 load(PD<2>& x, const long *p)
478 {
479 #ifdef NTL_HAVE_AVX512F
480 __m128i a = _mm_load_si128((const __m128i*)p);
481 x = _mm_cvtepi64_pd(a);
482 #else
483 __m128i a = _mm_load_si128((const __m128i*)p);
484 a = _mm_or_si128(a, _mm_castpd_si128(_mm_set1_pd(1L << 52)));
485 x = _mm_sub_pd(_mm_castsi128_pd(a), _mm_set1_pd(1L << 52));
486 #endif
487 }
488
489 // load unaligned and convert
490 inline void
491 loadu(PD<2>& x, const long *p)
492 {
493 #ifdef NTL_HAVE_AVX512F
494 __m128i a = _mm_loadu_si128((const __m128i*)p); x = _mm_cvtepi64_pd(a);
495 #else
496 __m128i a = _mm_loadu_si128((const __m128i*)p);
497 a = _mm_or_si128(a, _mm_castpd_si128(_mm_set1_pd(1L << 52)));
498 x = _mm_sub_pd(_mm_castsi128_pd(a), _mm_set1_pd(1L << 52));
499 #endif
500 }
501
502 // convert and store
503 inline void
504 store(long *p, PD<2> a)
505 {
506 #ifdef NTL_HAVE_AVX512F
507 __m128i b = _mm_cvtpd_epi64(a.data);
508 #ifdef __clang__
509 _mm_store_si128((__m128i*)p, b);
510 #else
511 // clang doesn't define this...why??
512 _mm_store_epi64(p, b);
513 #endif
514 #else
515 __m128d x = a.data;
516 x = _mm_add_pd(x, _mm_set1_pd(1L << 52));
517 __m128i b = _mm_xor_si128(
518 _mm_castpd_si128(x),
519 _mm_castpd_si128(_mm_set1_pd(1L << 52)));
520 _mm_store_si128((__m128i*)p, b);
521 #endif
522 }
523
524 // convert and store unaligned
525 inline void
526 storeu(long *p, PD<2> a)
527 {
528 #ifdef NTL_HAVE_AVX512F
529 __m128i b = _mm_cvtpd_epi64(a.data);
530 _mm_storeu_si128((__m128i*)p, b);
531 #else
532 __m128d x = a.data;
533 x = _mm_add_pd(x, _mm_set1_pd(1L << 52));
534 __m128i b = _mm_xor_si128(
535 _mm_castpd_si128(x),
536 _mm_castpd_si128(_mm_set1_pd(1L << 52)));
537 _mm_storeu_si128((__m128i*)p, b);
538 #endif
539 }
540
541
542 // res[i] = a[i] < b[i] ? a[i] : a[i]-b[i]
543 inline PD<2>
544 correct_excess(PD<2> a, PD<2> b)
545 {
546 #ifdef NTL_HAVE_AVX512F
547 __mmask8 k = _mm_cmp_pd_mask(a.data, b.data, _CMP_GE_OQ);
548 return _mm_mask_sub_pd(a.data, k, a.data, b.data);
549 #else
550 __m128d mask = _mm_cmp_pd(a.data, b.data, _CMP_GE_OQ);
551 __m128d corrected = _mm_sub_pd(a.data, b.data);
552 return _mm_blendv_pd(a.data, corrected, mask);
553 #endif
554 }
555
556 // res[i] = a[i] >= 0 ? a[i] : a[i]+b[i]
557 inline PD<2>
558 correct_deficit(PD<2> a, PD<2> b)
559 {
560 #ifdef NTL_HAVE_AVX512F
561 __mmask8 k = _mm_cmp_pd_mask(a.data, _mm_setzero_pd(), _CMP_LT_OQ);
562 return _mm_mask_add_pd(a.data, k, a.data, b.data);
563 #else
564 __m128d mask = _mm_cmp_pd(a.data, _mm_setzero_pd(), _CMP_LT_OQ);
565 __m128d corrected = _mm_add_pd(a.data, b.data);
566 return _mm_blendv_pd(a.data, corrected, mask);
567 #endif
568 }
569
570 inline void
571 clear(PD<2>& x)
572 { x.data = _mm_setzero_pd(); }
573
574 inline PD<2>
575 operator+(PD<2> a, PD<2> b)
576 { return _mm_add_pd(a.data, b.data); }
577
578 inline PD<2>
579 operator-(PD<2> a, PD<2> b)
580 { return _mm_sub_pd(a.data, b.data); }
581
582 inline PD<2>
583 operator*(PD<2> a, PD<2> b)
584 { return _mm_mul_pd(a.data, b.data); }
585
586 inline PD<2>
587 operator/(PD<2> a, PD<2> b)
588 { return _mm_div_pd(a.data, b.data); }
589
590 inline PD<2>&
591 operator+=(PD<2>& a, PD<2> b)
592 { a = a + b; return a; }
593
594 inline PD<2>&
595 operator-=(PD<2>& a, PD<2> b)
596 { a = a - b; return a; }
597
598 inline PD<2>&
599 operator*=(PD<2>& a, PD<2> b)
600 { a = a * b; return a; }
601
602 inline PD<2>&
603 operator/=(PD<2>& a, PD<2> b)
604 { a = a / b; return a; }
605
606 // a*b+c (fused)
607 inline PD<2>
608 fused_muladd(PD<2> a, PD<2> b, PD<2> c)
609 { return _mm_fmadd_pd(a.data, b.data, c.data); }
610
611 // a*b-c (fused)
612 inline PD<2>
613 fused_mulsub(PD<2> a, PD<2> b, PD<2> c)
614 { return _mm_fmsub_pd(a.data, b.data, c.data); }
615
616 // -a*b+c (fused)
617 inline PD<2>
618 fused_negmuladd(PD<2> a, PD<2> b, PD<2> c)
619 { return _mm_fnmadd_pd(a.data, b.data, c.data); }
620
621
622
623
624 //================== PD<8>/PD<4> conversions ================
625
626 #ifdef NTL_HAVE_AVX512F
627
628 // 0123, 4567 -> 01234567
629 inline PD<8>
630 join(PD<4> a, PD<4> b)
631 {
632 __m512d c = _mm512_castpd256_pd512(a.data);
633 return _mm512_insertf64x4(c, b.data, 1);
634 }
635
636 // 01234567 -> 0123
637 inline PD<4>
638 get_lo(PD<8> a)
639 { return _mm512_extractf64x4_pd(a.data, 0); }
640
641 // 01234567 -> 4567
642 inline PD<4>
643 get_hi(PD<8> a)
644 { return _mm512_extractf64x4_pd(a.data, 1); }
645
646 #endif
647
648 //================== PD<4>/PD<2> conversions ================
649
650 // 01, 23 -> 0123
651 inline PD<4>
652 join(PD<2> a, PD<2> b)
653 #if 0
654 // some versions of gcc are buggy and don't define this function
655 { return _mm256_set_m128d(b.data, a.data); }
656 #else
657 { return _mm256_insertf128_pd(_mm256_castpd128_pd256(a.data), b.data, 1); }
658 #endif
659
660
661 // 0123 -> 01
662 inline PD<2>
663 get_lo(PD<4> a)
664 { return _mm256_extractf128_pd(a.data, 0); }
665
666 // 0123 -> 23
667 inline PD<2>
668 get_hi(PD<4> a)
669 { return _mm256_extractf128_pd(a.data, 1); }
670
671
672 #endif
673
674
675 NTL_CLOSE_NNS
676
677
678 #endif
3030 std::cerr << "NTL_HAVE_FMA\n";
3131 #endif
3232
33 #ifdef NTL_HAVE_AVX512F
34 std::cerr << "NTL_HAVE_AVX512F\n";
35 #endif
36
3337 #ifdef NTL_HAVE_COPY_TRAITS1
3438 std::cerr << "NTL_HAVE_COPY_TRAITS1\n";
3539 #endif
3535
3636 #ifndef NTL_WordVectorMinAlloc
3737 #define NTL_WordVectorMinAlloc (4)
38 #endif
39
40 // vectors are always expanded by at least this ratio
41
42 #ifndef NTL_WordVectorExpansionRatio
43 #define NTL_WordVectorExpansionRatio (1.4)
4438 #endif
4539
4640 // controls initialization during input
391391 inline void add(ZZ& x, long a, const ZZ& b) { add(x, b, a); }
392392
393393
394 void sub(ZZ& x, const ZZ& a, long b);
394 inline void sub(ZZ& x, const ZZ& a, long b)
395 { _ntl_gssub(a.rep, b, &x.rep); }
396
395397 void sub(ZZ& x, long a, const ZZ& b);
398 // defined in ZZ.cpp
396399
397400 /* operator/function notation */
398401
17561759
17571760 void InvModError(const char *s, const ZZ& a, const ZZ& n);
17581761
1762 #ifdef NTL_PROVIDES_SS_LIP_IMPL
1763
1764 inline void
1765 LeftRotate_lip_impl(ZZ& a, const ZZ& b, long e, const ZZ& p, long n, ZZ& scratch)
1766 // Compute a = b * 2^e mod p, where p = 2^n+1. 0<=e<n and 0<b<p
1767 // a may not alias p
1768 // scratch may not alias a, b, or p
1769 {
1770 _ntl_leftrotate(&a.rep, &b.rep, e, p.rep, n, &scratch.rep);
1771 }
1772
1773 inline void
1774 SS_AddMod_lip_impl(ZZ& x, const ZZ& a, const ZZ& b, const ZZ& p, long n)
1775 // x = a + b mod p, where p = 2^n+1, a, b in [0, p).
1776 // x may not alias p.
1777 {
1778 _ntl_ss_addmod(&x.rep, &a.rep, &b.rep, p.rep, n);
1779 }
1780
1781 inline void
1782 SS_SubMod_lip_impl(ZZ& x, const ZZ& a, const ZZ& b, const ZZ& p, long n)
1783 // x = a - b mod p, where p = 2^n+1, a, b in [0, p).
1784 // x may not alias b or p.
1785 {
1786 _ntl_ss_submod(&x.rep, &a.rep, &b.rep, p.rep, n);
1787 }
1788
1789 #endif
1790
1791
1792
1793
1794
17591795 NTL_CLOSE_NNS
17601796
17611797
561561 public:
562562 long k; // a 2^k point representation
563563 long MaxK; // maximum space allocated
564 long len; // length of truncated FFT
564565 long NumPrimes;
565566 Unique2DArray<long> tbl;
566567
567 FFTRep() : k(-1), MaxK(-1), NumPrimes(0) { }
568
569 FFTRep(const FFTRep& R) : k(-1), MaxK(-1), NumPrimes(0)
568 FFTRep() : k(-1), MaxK(-1), len(0), NumPrimes(0) { }
569
570 FFTRep(const FFTRep& R) : k(-1), MaxK(-1), len(0), NumPrimes(0)
570571 { *this = R; }
571572
572 FFTRep(INIT_SIZE_TYPE, long InitK) : k(-1), MaxK(-1), NumPrimes(0)
573 FFTRep(INIT_SIZE_TYPE, long InitK) : k(-1), MaxK(-1), len(0), NumPrimes(0)
573574 { SetSize(InitK); }
574575
575576 FFTRep& operator=(const FFTRep& R);
578579 };
579580
580581
581 void ToFFTRep(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi);
582 void ToFFTRep_trunc(FFTRep& y, const ZZ_pX& x, long k, long len,
583 long lo, long hi);
584
585 inline void ToFFTRep_trunc(FFTRep& y, const ZZ_pX& x, long k, long len)
586 { ToFFTRep_trunc(y, x, k, len, 0, deg(x)); }
587
588 inline
589 void ToFFTRep(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi)
582590 // computes an n = 2^k point convolution of x[lo..hi].
591 { ToFFTRep_trunc(y, x, k, 0, lo, hi); }
583592
584593 inline void ToFFTRep(FFTRep& y, const ZZ_pX& x, long k)
585594
224224
225225 #endif
226226
227 #if 1
228 #define NTL_ENABLE_AVX_FFT
229
230 /*
231 * This will compile NTL in a way that enables an AVX implementation
232 * of the small-prime FFT.
233 */
234
235 #endif
236
237
238 #if 0
239 #define NTL_AVOID_AVX512
240
241 /*
242 * This will compile NTL in a way that avoids 512-bit operations,
243 * even if AVX512 is available.
244 */
245
246 #endif
227247
228248 #if 0
229249 #define NTL_RANGE_CHECK
88
99 #include <NTL/PackageInfo.h>
1010
11
12 #if (!defined(NTL_HAVE_LL_TYPE) && defined(_MSC_VER) && defined(NTL_WINPACK))
13 // for the windows distribution, for MSVC++ we assume LL_TYPE works
11 #if (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)))
12 #define NTL_GNUC_INTEL
13 #endif
14
15 #if (!defined(NTL_HAVE_LL_TYPE) && defined(NTL_WINPACK) && (defined(_MSC_VER) || defined(NTL_GNUC_INTEL)))
16 // for the windows distribution,
17 // we assume LL_TYPE works for MSVC++ (which is true for both x86 and ARM)
18 // and for GNUC/Intel platforms (e.g., Code Blocks)
1419 #define NTL_HAVE_LL_TYPE
1520 #endif
1621
503508
504509 #define NTL_AVX_LOCAL_ARRAY(x, type, n) NTL_ALIGNED_LOCAL_ARRAY(NTL_AVX_BYTE_ALIGN, x, type, n)
505510
511 #define NTL_AVX512_BYTE_ALIGN (64)
512
513 #define NTL_AVX512_LOCAL_ARRAY(x, type, n) NTL_ALIGNED_LOCAL_ARRAY(NTL_AVX512_BYTE_ALIGN, x, type, n)
514
515
506516 #define NTL_DEFAULT_ALIGN (64)
507517 // this should be big enough to satisfy any SIMD instructions,
508518 // and it should also be as big as a cache line
572582 }
573583
574584
585 // vectors are grown by a factor of 1.5
586 inline long _ntl_vec_grow(long n)
587 { return n + n/2; }
588
575589
576590 template <class T>
577591 struct _ntl_is_char_pointer
604618
605619
606620
607
608 #endif
621 #endif
1313 * but better for debugging.
1414 */
1515
16 struct _ntl_gbigint_body;
16 struct _ntl_gbigint_body {
17 long alloc_;
18 long size_;
19 };
20
1721 typedef _ntl_gbigint_body *_ntl_gbigint;
1822
1923
3842 #endif
3943
4044
41
42
43 #elif (NTL_LONGDOUBLE_OK && !defined(NTL_LEGACY_SP_MULMOD) && !defined(NTL_DISABLE_LONGDOUBLE))
45 #if (defined(NTL_ENABLE_AVX_FFT) && (NTL_SP_NBITS > 50))
46 #undef NTL_SP_NBITS
47 #define NTL_SP_NBITS (50)
48 #endif
49
50
51 #elif (NTL_LONGDOUBLE_OK && !defined(NTL_LEGACY_SP_MULMOD) && !defined(NTL_DISABLE_LONGDOUBLE) && !defined(NTL_ENABLE_AVX_FFT))
4452
4553 #define NTL_LONGDOUBLE_SP_MULMOD
4654
142150 // DIRT: These are copied from the lip.cpp file
143151
144152 inline long& _ntl_ALLOC(_ntl_gbigint p)
145 { return (((long *) p)[0]); }
153 { return p->alloc_; }
146154
147155 inline long& _ntl_SIZE(_ntl_gbigint p)
148 { return (((long *) p)[1]); }
156 { return p->size_; }
149157
150158 inline long _ntl_ZEROP(_ntl_gbigint p)
151159 {
166174
167175 void _ntl_gsadd(_ntl_gbigint a, long d, _ntl_gbigint *b);
168176 /* *b = a + d */
177
178 void _ntl_gssub(_ntl_gbigint a, long d, _ntl_gbigint *b);
179 /* *b = a - d */
169180
170181 void _ntl_gadd(_ntl_gbigint a, _ntl_gbigint b, _ntl_gbigint *c);
171182 /* *c = a + b */
665676 void
666677 _ntl_quick_accum_end(_ntl_gbigint x);
667678
668 #endif
679 // special-purpose routines for SSMul in ZZX
680
681 #if (defined(NTL_GMP_LIP) && (NTL_ZZ_NBITS & (NTL_ZZ_NBITS-1)) == 0)
682 // NOTE: the test (NTL_ZZ_NBITS & (NTL_ZZ_NBITS-1)) == 0
683 // effectively checks that NTL_ZZ_NBITS is a power of two
684
685 #define NTL_PROVIDES_SS_LIP_IMPL
686
687 void
688 _ntl_leftrotate(_ntl_gbigint *a, const _ntl_gbigint *b, long e,
689 _ntl_gbigint p, long n, _ntl_gbigint *scratch);
690
691 void
692 _ntl_ss_addmod(_ntl_gbigint *x, const _ntl_gbigint *a,
693 const _ntl_gbigint *b, _ntl_gbigint p, long n);
694 void
695 _ntl_ss_submod(_ntl_gbigint *x, const _ntl_gbigint *a,
696 const _ntl_gbigint *b, _ntl_gbigint p, long n);
697 #endif
698
699
700 #endif
202202
203203 static long storage() { return sizeof(long); }
204204
205 static bool IsFFTPrime() { return zz_pInfo->p_info != 0; }
206
205207 zz_p(long a, INIT_LOOP_HOLE_TYPE) { _zz_p__rep = a; }
206208
207209 // for consistency
571571 public:
572572 long k; // a 2^k point representation
573573 long MaxK; // maximum space allocated
574 long len; // length of truncated FFT
574575 long NumPrimes;
575576 UniqueArray<long> tbl[4];
576577
577 fftRep() : k(-1), MaxK(-1), NumPrimes(0) { }
578
579 fftRep(const fftRep& R) : k(-1), MaxK(-1), NumPrimes(0)
578 fftRep() : k(-1), MaxK(-1), len(0), NumPrimes(0) { }
579
580 fftRep(const fftRep& R) : k(-1), MaxK(-1), len(0), NumPrimes(0)
580581 { *this = R; }
581582
582 fftRep(INIT_SIZE_TYPE, long InitK) : k(-1), MaxK(-1), NumPrimes(0)
583 fftRep(INIT_SIZE_TYPE, long InitK) : k(-1), MaxK(-1), len(0), NumPrimes(0)
583584 { SetSize(InitK); }
584585
585586 fftRep& operator=(const fftRep&);
588589 };
589590
590591
591 void TofftRep(fftRep& y, const zz_pX& x, long k, long lo, long hi);
592
593
594 void TofftRep_trunc(fftRep& y, const zz_pX& x, long k, long len,
595 long lo, long hi);
596
597 inline void TofftRep_trunc(fftRep& y, const zz_pX& x, long k, long len)
598 { TofftRep_trunc(y, x, k, len, 0, deg(x)); }
599
600 inline
601 void TofftRep(fftRep& y, const zz_pX& x, long k, long lo, long hi)
592602 // computes an n = 2^k point convolution of x[lo..hi].
603 { TofftRep_trunc(y, x, k, 0, lo, hi); }
593604
594605 inline void TofftRep(fftRep& y, const zz_pX& x, long k)
595606
0
1 #ifndef NTL_pd_FFT__H
2 #define NTL_pd_FFT__H
3
4
5 #include <NTL/tools.h>
6
7 NTL_OPEN_NNS
8
9
10 // Sets control register so that rounding mode
11 // is "down". Destructor restores control regsiter.
12 struct CSRPush {
13 unsigned int reg;
14 CSRPush();
15 ~CSRPush();
16 };
17
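// [Editor's aside] Usage is plain RAII (a hedged illustration, not code from
// the NTL sources): construct a CSRPush in the scope that needs round-down
// mode, and the saved control register is restored when the scope exits.
//
//    {
//       CSRPush csr_guard;   // rounding mode is now "down"
//       // ... run the AVX FFT kernels that depend on this mode ...
//    }                       // destructor restores the saved register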
18
19 struct pd_mod_t {
20 double q;
21 const double **wtab;
22 const double **wqinvtab;
23 const double **wtab1;
24 const double **wqinvtab1;
25 };
26
27
28 void
29 pd_LazyPrepMulModPrecon_impl(double *bninv, const double *b, double n, long len);
30
31 void
32 pd_fft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
33 long yn, long xn);
34
35 void
36 pd_fft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
37 long yn, long xn, double fac);
38
39 void
40 pd_ifft_trunc_impl(long* A, const long* a, double* xp, long lgN,
41 const pd_mod_t& mod, long yn);
42
43 void
44 pd_ifft_trunc_impl(long* A, const long* a, double* xp, long lgN,
45 const pd_mod_t& mod, long yn, double fac);
46
47 NTL_CLOSE_NNS
48
49 #endif
8484
8585
8686
87 void quad_float_normalize(quad_float& z, const double& xhi, const double& xlo);
88
89 void quad_float_in_place_add(quad_float& x, const quad_float& y);
90 void quad_float_in_place_sub(quad_float& x, const quad_float& y);
91 void quad_float_in_place_mul(quad_float& x, const quad_float& y);
92 void quad_float_in_place_div(quad_float& x, const quad_float& y);
93
94 void quad_float_in_place_negate(quad_float& x);
95 void quad_float_in_place_sqrt(quad_float& y, double& c_ref);
96
97 void quad_float_PrecisionOK(long&, const double&);
98
99
100
101
87102 #if (NTL_BITS_PER_INT < NTL_DOUBLE_PRECISION)
88103
89104 inline quad_float to_quad_float(int n) { return quad_float(n, 0); }
101116
102117
103118
104 inline quad_float to_quad_float(double x) { return quad_float(TrueDouble(x), 0); }
119
120 // NOTE: on extended precision platforms, the call to TrueDouble
121 // should strip the extra precision
122 inline quad_float to_quad_float(double x)
123 { return quad_float(TrueDouble(x), 0); }
105124
106125 inline quad_float to_quad_float(float x)
107126 { return to_quad_float(double(x)); }
109128 inline quad_float& quad_float::operator=(double x)
110129 { *this = to_quad_float(x); return *this; }
111130
112 quad_float operator+(const quad_float&, const quad_float& );
131
132
133
134
135 inline quad_float& operator+= (quad_float& x, const quad_float& y)
136 { quad_float_in_place_add(x, y); return x; }
137 inline quad_float& operator-= (quad_float& x, const quad_float& y)
138 { quad_float_in_place_sub(x, y); return x; }
139 inline quad_float& operator*= (quad_float& x, const quad_float& y)
140 { quad_float_in_place_mul(x, y); return x; }
141 inline quad_float& operator/= (quad_float& x, const quad_float& y)
142 { quad_float_in_place_div(x, y); return x; }
143
144 inline quad_float operator-(const quad_float& x)
145 { quad_float xx = x; quad_float_in_place_negate(xx); return xx; }
146
147
148
149 inline quad_float operator+(const quad_float& x, const quad_float& y)
150 { quad_float xx = x; xx += y; return xx; }
151
152 inline quad_float operator-(const quad_float& x, const quad_float& y)
153 { quad_float xx = x; xx -= y; return xx; }
154
155 inline quad_float operator*(const quad_float& x, const quad_float& y)
156 { quad_float xx = x; xx *= y; return xx; }
157
158 inline quad_float operator/(const quad_float& x, const quad_float& y)
159 { quad_float xx = x; xx /= y; return xx; }
160
161
162
163
164
165
113166
114167 inline quad_float operator+(const quad_float& x, double y )
115168 { return x + to_quad_float(y); }
117170 inline quad_float operator+(double x, const quad_float& y)
118171 { return to_quad_float(x) + y; }
119172
120 quad_float operator-(const quad_float&, const quad_float& );
121
122173 inline quad_float operator-(const quad_float& x, double y )
123174 { return x - to_quad_float(y); }
124175
125176 inline quad_float operator-(double x, const quad_float& y)
126177 { return to_quad_float(x) - y; }
127178
128 quad_float operator*(const quad_float&, const quad_float& );
129
130179 inline quad_float operator*(const quad_float& x, double y )
131180 { return x * to_quad_float(y); }
132181
133182 inline quad_float operator*(double x, const quad_float& y)
134183 { return to_quad_float(x) * y; }
135184
136 quad_float operator/(const quad_float&, const quad_float& );
137
138185 inline quad_float operator/(const quad_float& x, double y )
139186 { return x / to_quad_float(y); }
140187
141188 inline quad_float operator/(double x, const quad_float& y)
142189 { return to_quad_float(x) / y; }
143190
144 quad_float operator-(const quad_float& x);
145
146 quad_float& operator+= (quad_float& x, const quad_float& y);
191
192
147193 inline quad_float& operator += (quad_float& x, double y)
148194 { x += to_quad_float(y); return x; }
149195
150 quad_float& operator-= (quad_float& x, const quad_float& y);
151196 inline quad_float& operator-= (quad_float& x, double y)
152197 { x -= to_quad_float(y); return x; }
153198
154 quad_float& operator*= (quad_float& x, const quad_float& y);
155199 inline quad_float& operator*= (quad_float& x, double y)
156200 { x *= to_quad_float(y); return x; }
157201
158 quad_float& operator/= (quad_float& x, const quad_float& y);
159202 inline quad_float& operator/= (quad_float& x, double y)
160203 { x /= to_quad_float(y); return x; }
161204
291334
292335 long IsFinite(quad_float *x);
293336
294 long PrecisionOK();
295337
296338 quad_float ldexp(const quad_float& x, long exp);
297339
432432 // on relatively modern versions of gcc, we can
433433 // declare "restricted" pointers in C++
434434
435 // we also can use __attribute__((always_inline))
436
437
435438 #define NTL_RESTRICT __restrict
439 #define NTL_ALWAYS_INLINE __attribute__((always_inline))
436440
437441 #else
438442
439443 #define NTL_RESTRICT
440
441 #endif
444 #define NTL_ALWAYS_INLINE
445
446 #endif
447
448
449
450
442451
443452 // A very lightly wrapped pointer that does nothing more than provide
444453 // auto cleanup in a destructor. Use the UniquePtr class (in SmartPtr.h)
808817 );
809818 }
810819
820 inline void
821 ll_add(ll_type& x, const ll_type& a)
822 {
823 __asm__ (
824 "addq %[alo],%[xlo] \n\t"
825 "adcq %[ahi],%[xhi]" :
826 [xhi] "+r" (x.hi), [xlo] "+r" (x.lo) :
827 [ahi] "rm" (a.hi), [alo] "rm" (a.lo) :
828 "cc"
829 );
830 }
831
832
811833
812834
813835 // NOTE: an optimizing compiler will remove the conditional.
923945 x += a;
924946 }
925947
948 inline void
949 ll_add(ll_type& x, const ll_type& a)
950 {
951 x += a;
952 }
953
954
926955 // NOTE: shamt must be in the range 0..NTL_BITS_PER_LONG-1
927956 template<long shamt>
928957 unsigned long
9771006 #define NTL_DECLARE_RELOCATABLE_WHEN(x) \
9781007 constexpr bool DeclareRelocatableType x
9791008
980 #if (defined(NTL_HAVE_COPY_TRAITS1) || defined(_MSC_VER))
1009 #if (defined(NTL_HAVE_COPY_TRAITS1) || defined(NTL_WINPACK))
9811010
9821011
9831012 // This strategy is used on compilers that fully support C++11 type traits.
9871016 // Just to be on the safe side, I check for a trivial destructor.
9881017
9891018 // This strategy is checked in the CheckCOPY_TRAITS1.cpp program.
1019
1020 // We also use this strategy for the WINPACK distribution.
1021 // It should work on Windows with any compiler that properly supports C++11
9901022
9911023
9921024 template<class T>
3939 #define NTL_VectorMinAlloc (4)
4040 #endif
4141
42 // vectors are always expanded by at least this ratio
43
44 #ifndef NTL_VectorExpansionRatio
45 #define NTL_VectorExpansionRatio (1.4)
46 #endif
47
4842 // controls initialization during input
4943
5044 #ifndef NTL_VectorInputBlock
112106 guard.relax();
113107 }
114108
115 template<class T>
116 void BlockMoveConstructFromVec(T* p, long n, const T* q)
117 {
118 long i;
119
120 NTL_SCOPE(guard) { default_BlockDestroy(p, i); };
121
122 for (i = 0; i < n; i++)
123 (void) new(&p[i]) T(q[i]);
124
125 guard.relax();
126 }
127
128109
129110 template<class T>
130111 void BlockConstructFromVec(T* p, long n, const T* q) { default_BlockConstructFromVec(p, n, q); }
131
132112
133113
134114
681661 NTL_VEC_HEAD(_vec__rep)->fixed = 0;
682662 }
683663 else if (n > NTL_VEC_HEAD(_vec__rep)->alloc) {
684 m = max(n, long(NTL_VectorExpansionRatio*NTL_VEC_HEAD(_vec__rep)->alloc));
664 m = max(n, _ntl_vec_grow(NTL_VEC_HEAD(_vec__rep)->alloc));
685665 m = ((m+NTL_VectorMinAlloc-1)/NTL_VectorMinAlloc) * NTL_VectorMinAlloc;
686666
687667 ReAllocate(m, VecStrategy<NTL_RELOC_TAG>());
11 #ifndef NTL_version__H
22 #define NTL_version__H
33
4 #define NTL_VERSION "11.0.0"
4 #define NTL_VERSION "11.3.0"
55
66 #define NTL_MAJOR_VERSION (11)
7 #define NTL_MINOR_VERSION (0)
7 #define NTL_MINOR_VERSION (3)
88 #define NTL_REVISION (0)
99
1010 #endif
0
1 #include <NTL/ctools.h>
2
3 #include <cstdlib>
4 #include <immintrin.h>
5 #include <iostream>
6
7 // This actually checks for AVX512F+DQ+VL
8
9
10 #if (!defined(__GNUC__) || !defined(__x86_64__) || !defined(__AVX512F__))
11 #error "AVX512F not supported"
12 #endif
13
14 #if (!defined(__AVX512VL__) || !defined(__AVX512DQ__))
15 #error "AVX512F not supported"
16 #endif
17
18 #if (NTL_BITS_PER_LONG != 64 || NTL_BITS_PER_INT != 32 || NTL_DOUBLE_PRECISION != 53)
19 #error "AVX512F not supported"
20 // sanity check -- code that uses this feature also relies on this
21 #endif
22
23 #ifndef NTL_HAVE_ALIGNED_ARRAY
24 #error "AVX512F not supported"
25 #endif
26
27 using namespace std;
28
29
30 void fun(double * x, const double *a, const double *b)
31 {
32 __m512d xvec, avec, bvec, cvec;
33
34 avec = _mm512_load_pd(a);
35 bvec = _mm512_load_pd(b);
36 xvec = _mm512_load_pd(x);
37
38 xvec = _mm512_fmadd_pd(avec, bvec, xvec);
39
40 _mm512_store_pd(x, xvec);
41 }
42
43 void fun1(double *x, const long *p)
44 {
45 __m256i a = _mm256_load_si256((const __m256i*)p);
46 _mm256_store_pd(x, _mm256_cvtepi64_pd(a));
47 }
48
49
50 int main()
51 {
52 NTL_AVX512_LOCAL_ARRAY(vp, double, 24);
53
54 double *a = vp + 0*8;
55 double *b = vp + 1*8;
56 double *x = vp + 2*8;
57
58 a[0] = atoi("1");
59 a[1] = atoi("2");
60 a[2] = atoi("3");
61 a[3] = atoi("4");
62 a[4] = atoi("5");
63 a[5] = atoi("6");
64 a[6] = atoi("7");
65 a[7] = atoi("8");
66
67 b[0] = atoi("2");
68 b[1] = atoi("3");
69 b[2] = atoi("4");
70 b[3] = atoi("5");
71 b[4] = atoi("6");
72 b[5] = atoi("7");
73 b[6] = atoi("8");
74 b[7] = atoi("9");
75
76 x[0] = atoi("3");
77 x[1] = atoi("4");
78 x[2] = atoi("5");
79 x[3] = atoi("6");
80 x[4] = atoi("7");
81 x[5] = atoi("8");
82 x[6] = atoi("9");
83 x[7] = atoi("10");
84
85 fun(x, a, b);
86
87 NTL_AVX_LOCAL_ARRAY(lp, long, 4);
88 NTL_AVX_LOCAL_ARRAY(dp, double, 4);
89
90 lp[0] = atoi("1");
91 lp[1] = atoi("2");
92 lp[2] = atoi("3");
93 lp[3] = atoi("4");
94
95 fun1(dp, lp);
96
97 if (x[0] == 5 && x[1] == 10 && x[2] == 17 && x[3] == 26 &&
98 x[4] == 37 && x[5] == 50 && x[6] == 65 && x[7] == 82 &&
99 dp[0] == 1 && dp[1] == 2 && dp[2] == 3 && dp[3] == 4)
100 return 0;
101 else
102 return -1;
103 }
104
105
106
3131
3232 #if (defined(NTL_THREADS) && defined(NTL_TLS_HACK))
3333
34 #warning "TLS_HACK=on"
3534
3635 namespace details_pthread {
3736
0 ntl-11.0.0
0 ntl-11.3.0
9797 cout << "NTL_SAFE_VECTORS\n";
9898 #endif
9999
100 #ifdef NTL_ENABLE_AVX_FFT
101 cout << "NTL_ENABLE_AVX_FFT\n";
102 #endif
103
104 #ifdef NTL_AVOID_AVX512
105 cout << "NTL_AVOID_AVX512\n";
106 #endif
107
100108 #ifdef NTL_RANGE_CHECK
101109 cout << "NTL_RANGE_CHECK\n";
102110 #endif
7777 'NTL_CLEAN_PTR' => 'on',
7878 'NTL_SAFE_VECTORS' => 'on',
7979 'NTL_RANGE_CHECK' => 'off',
80 'NTL_ENABLE_AVX_FFT' => 'off',
81 'NTL_AVOID_AVX512' => 'off',
8082
8183
8284 'NTL_SPMM_ULL' => 'off',
620622 ($config_info =~ /\((.*?),(.*?),(.*?)\)/) or die "Error: GenConfigInfo failed";
621623
622624 # convert to number
623 $language_standard += 0 or die "Error: GenConfigInfo failed";
625 $language_standard += 0 or Warning("__cplusplus not correctly defined");
624626
625627 print("compiler_name=$compiler_name\n");
626628 print("language_standard=$language_standard\n");
+2479
-1568
src/FFT.cpp
00
11 #include <NTL/FFT.h>
2 #include <NTL/FFT_impl.h>
3
4 #ifdef NTL_ENABLE_AVX_FFT
5 #include <NTL/SmartPtr.h>
6 #include <NTL/pd_FFT.h>
7 #endif
28
39
410 /********************************************************************
511
612 This is an implementation of a "small prime" FFT, which lies at the heart of
7 the ZZ_pX arithmetic, as well as some other applications.
8
9 The basic algorithm is loosely based on the routine in the Cormen, Leiserson,
10 Rivest, and Stein book on algorithms.
11
12
13 CACHE PERFORMANCE
14
15 Some attention has been paid to cache performance, but there is still more that
16 could be done.
17
18
19 The bit-reverse-copy (BRC) algorithm is a simple table-driven algorithm up to
20 a certain threshold, and then switches to the COBRA algorithm from Carter and
21 Gatlin, "Towards an optimal bit-reversal permutation algorithm", FOCS 1998.
22 I've found that COBRA helps, but not much: just 5-10%. I've also found that
23 getting rid of BRC altogether leads to another 5-10% improvement. These
24 numbers are based on experiments with 2^{17}- and 2^{19}-point FFTs, looping
25 over 50 different primes on a Core 2 Duo machine.
26
27 One could get rid of bit-reverse-copy altogether. The current FFT routines all
28 implement what is called Decimation-In-Time (DIT), which means that inputs are
29 bit reversed. One can also implement the FFT using Decimation-In-Frequency
30 (DIF), which means that the outputs are bit reversed. One can get rid of the
31 bit reversals for doing convolutions by simply doing the forward FFT using
32 DIF-FFT and the reverse FFT using DIT-FFT. This would allow one to simply
33 eliminate all of the bit-reversal steps, which would lead to some nontrivial
34 savings. However, there are a few places in NTL where I rely on the ordering
35 of elements within an FFTRep to be their "natural ordering". The reduce and
36 AddExpand routines in ZZ_pX come to mind (which actually may become simpler),
37 along with RevToFFTRep and RevFromFFTRep (which may be trickier). Anyway,
38 because BRC doesn't seem to be a big problem right now, it doesn't seem worth
39 worrying about this.
40
41
42 Within the FFT algorithm itself, I have not tried anything like Bailey's 4-step
43 algorithm. Maybe this should be tested. However, I somehow doubt that
44 anything more than modest gains will be achieved, since most modern processors
45 now employ a kind of memory prefetching technique, to keep the cache filled
46 with memory locations that are likely to be used next. Moreover, the FFT
47 algorithm used here accesses memory for the most part in small, sequential
48 strides, which meshes well with hardware prefetching. The paper "Algorithms to
49 Take Advantage of Hardware Prefetching" [Pan, Cherng, Dick, Ladner, Workshop on
50 Algorithm Engineering and Experiments, 2007] contains some interesting
51 experiments and useful background information. Anyway, there is still room for
52 more experimentation.
53
13 ZZ_pX and zz_pX arithmetic, and impacts many other applications as well
14 (such as arithmetic in ZZ_pEX, zz_pEX, and ZZX).
15
16 The algorithm is a Truncated FFT based on code originally developed by David
17 Harvey. David's code built on the single-precision modular multiplication
18 technique introduced in NTL many years ago, but also uses a "lazy
19 multiplication" technique, which reduces the number of "correction" steps that
20 need to be performed in each butterfly (see below for more details). It also
21 implements a version of the Truncated FFT algorithm introduced by Joris van der
22 Hoeven at ISSAC 2004. Also see "A cache-friendly truncated FFT", David Harvey,
23 Theoretical Computer Science Volume 410, Issues 27-29, 28 June 2009, Pages
24 2649-2658.
25
26 I have almost completely re-written David's original code to make it fit into
27 NTL's software framework; however, all of the key logic is still based on
28 David's code. David's original code also implemented a 2D transformation which
29 is more cache-friendly for *very* large transforms. However, my experiments
30 indicated this was only beneficial for transforms of size at least 2^20, and so
31 I did not incorporate this variant.
32
33 Here is the Copyright notice from David's original code:
34
35
36 ==============================================================================
37
38 fft62: a library for number-theoretic transforms
39
40 Copyright (C) 2013, David Harvey
41
42 All rights reserved.
43
44 Redistribution and use in source and binary forms, with or without
45 modification, are permitted provided that the following conditions are met:
46
47 * Redistributions of source code must retain the above copyright notice, this
48 list of conditions and the following disclaimer.
49 * Redistributions in binary form must reproduce the above copyright notice,
50 this list of conditions and the following disclaimer in the documentation
51 and/or other materials provided with the distribution.
52
53 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
54 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
56 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
57 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
59 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
60 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
61 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
62 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
63
64
65 ==============================================================================
5466
5567
5668 SINGLE-PRECISION MODULAR ARITHMETIC
8193 necessary. To be more portable, some of these computations should really be
8294 done using unsigned arithmetic, but that is not so important here. Also, the
8395 adjustment steps can be replaced by simple non-branching instruction sequences
84 involving SHIFT, AND, and ADD/SUB instructions. On many modern machines, this
85 is usually faster and NTL uses this non-branching strategy.
96 involving SHIFT, AND, and ADD/SUB instructions. On some modern machines, this
97 is usually faster and NTL uses this non-branching strategy. However, on other
98 machines (modern x86's are an example of this), conditional move instructions
99 can be used in place of branching, and this code can be faster than the
100 non-branching code. NTL's performance-tuning script will figure out the best
101 way to do this.
102
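As a point of reference, here is a minimal scalar sketch of the SHIFT/AND/ADD-SUB
idea (an editor's illustration, not NTL's actual sp_Correct* code). It assumes a
64-bit long and relies on arithmetic right shift of a negative value, which is
implementation-defined in C++ but behaves as expected on the platforms NTL targets.
The conditional-move flavor is just the ternary form, which compilers typically
lower to a cmov on x86:

   // reduce r from [0, 2n) to [0, n) without branching
   inline long correct_excess_shift_sketch(long r, long n)
   {
      r = r - n;                      // now in [-n, n)
      return r + ((r >> 63) & n);     // add n back only if r went negative
   }

   // same reduction, written so the compiler can emit a conditional move
   inline long correct_excess_cmov_sketch(long r, long n)
   {
      long s = r - n;
      return (s < 0) ? r : s;
   }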
86103
87104 Other simple optimizations can be done, such as precomputing 1/double(n) when n
88105 remains fixed for many computations, as is often the case.
110127 details about it, but since then, it has come to be known as "Shoup
111128 multiplication" in a few papers, so I'll accept that. :-) The paper "Faster
112129 arithmetic for number-theoretic transforms" [David Harvey, J. Symb. Comp. 60
113 (2014) 113–119] seems to be the first place where it is discussed in detail,
130 (2014)] seems to be the first place where it is discussed in detail,
114131 and Harvey's paper also contains some improvements which I discuss below.
115132
116133 The basic idea is that in many computations, not only n, but one of the
184201 not really be necessary (assuming that computing both high and low words of a
185202 double-word product is no more expensive than just computing the low word).
186203 However, none of the compilers I've used have been able to perform that
187 optimization.
204 optimization (in NTL v11.1, I added code that hand-codes this optimization).
188205
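To make the preconditioning idea concrete, here is a rough scalar sketch written by
the editor (it is not NTL's MulModPrecon, whose interface and corner cases differ).
It assumes 64-bit words, a modulus n below 2^63, a fixed multiplier b in [0, n), and
the unsigned __int128 extension provided by gcc/clang:

   #include <cstdint>

   typedef unsigned __int128 u128;

   // precompute bninv = floor(b * 2^64 / n) for the fixed multiplier b
   inline std::uint64_t prep_mulmod_precon_sketch(std::uint64_t b, std::uint64_t n)
   {
      return (std::uint64_t)(((u128) b << 64) / n);
   }

   // return a*b mod n; the quotient estimate q is off by at most one,
   // so a single correction step suffices
   inline std::uint64_t mulmod_precon_sketch(std::uint64_t a, std::uint64_t b,
                                             std::uint64_t n, std::uint64_t bninv)
   {
      std::uint64_t q = (std::uint64_t)(((u128) a * bninv) >> 64);
      std::uint64_t r = a*b - q*n;     // computed mod 2^64; true value lies in [0, 2n)
      return (r >= n) ? r - n : r;
   }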
189206
190207 64-BIT MACHINES
191208
192 FIXME: this discussion is out of date.
193209 Current versions of NTL use (by default) 60-bit moduli based
194210 on all-integer arithmetic.
195211
196212
197213 Prior to v9.0 of NTL, on 64 bits, the modulus n was restricted to 50 bits, in
198214 order to allow the use of double-precision techniques, as double's have 53 bits
199 of precision. However, since the x86-64 is such an important target, and
200 one can still access the old x87 FPU, which provided 64-bit precision, the
201 bound on n on such platforms is now 60 bits. Actually, 62 bits could be
202 supported, but other things (namely, the TBL_REM implementation in
203 lip.cpp) start to slow down if 62 bits are used, so 60 seems like a good
204 compromise. Currently, 60-bit moduli are available only when using gcc on
205 x86-64 machines, and when compiling NTL with GMP.
206
207 Now, the FPU-based multiplies are in fact a bit slower than the SSE-based
208 multiplies. However, with the preconditioned all-integer MulMod's now used
209 extensively on almost all critical paths within NTL, this does not really
210 matter, and in fact, many things get faster with the wider moduli, so overall,
211 it is a net performance gain.
215 of precision. However, NTL now supports 60-bit moduli. Actually, 62 bits can
216 be supported by setting the NTL_MAXIMIZE_SP_NBITS configuration flag, but other
217 things (namely, the TBL_REM implementation in lip.cpp) start to slow down if 62
218 bits are used, so 60 seems like a good compromise. Currently, 60-bit moduli
219 are available only when compiling NTL with GMP, and when some kind of extended
220 integer or floating-point arithmetic is available.
212221
213222
214223 FUTURE TRENDS
215224
216 In the future, I might also experiment with other MulMod techniques, such as
217 those described in "Improved Division by Invariant Integers" [Moeller,
218 Granlund, IEEE Transactions on Computers, June 2010]. This might allow for,
219 say, 60-bit moduli on 64-bit machines that don't have extended double
220 precision. It is not clear how the performance of this would compare with the
221 floating-point methods; however, it probably doesn't matter too much, as the
222 preconditioned MulMod's are the most important ones.
223
224 It might also be useful to go back and reconsider Montgomery multiplication, at
225 least for "internal" use, like the FFT. However, I doubt that this will help
226 significantly.
227
228 As mentioned above, it could be useful to experiment with more cache-friendly
229 variants of the FFT, like Bailey's 4-step method. I could also experiment with
230 using the DIF/DIT. This affects some code outside of FFT as well (in ZZ_pX and
231 zz_pX, like reduce and AddExpand), but should not affect any documented
232 interfaces.
233
234 Another direction to consider is exploiting concurrency. Besides using
235 multiple cores to parallelize things at a higher level, it would be nice to
236 exploit newer SIMD instructions. Unfortunately, as of now (early 2015), these
237 don't seem to have the functionality I need. A 64-bit x 64-bit -> low order
238 64-bit instruction is supposed to be available soon in the new AVX-512
239 instruction set. That would be a good start, but I would really like to get
240 the high-order 64-bits too. Maybe that will come someday. In the mean time, it
241 might be fun to experiment with using the AVX-512 instructions that will be
242 available, which will at least allow a floating-point-based
243 implementation, or an all-integer implementation with emulated MulHi. I have
244 no idea how performance will compare.
245
225
226 * The following papers
227
228 https://eprint.iacr.org/2017/727
229 https://eprint.iacr.org/2016/504
230 https://eprint.iacr.org/2015/382
231
232 present FFTs that access the pre-computed tables in a somewhat more efficient
233 fashion, so that we only need to read from the tables O(n) times, rather than
234 O(n log n) times.
235
236 I've partially implemented this, and have gotten mixed results.
237 For smallish FFT's (below k=10 or 11), this code is somewhat slower.
238 For larger FFT's (say, k=17), I see a speedup of 3-10%.
246239
247240
248241 ********************************************************************/
249242
250243
251244
252
253
254
255
256 // #define NTL_BRC_TEST
257 // Flag to test the cost of "bit reverse copy"
258
259
260 #define NTL_FFT_BIGTAB_LIMIT (200)
261 #ifndef NTL_BRC_TEST
245 #define NTL_FFT_BIGTAB_LIMIT (180)
262246 #define NTL_FFT_BIGTAB_MAXROOT (17)
263 #else
264 #define NTL_FFT_BIGTAB_MAXROOT (25)
265 #endif
266 // big tables are only used for the first NTL_FFT_BIGTAB_LIMIT primes,
267 // and then only for k-values at most NTL_FFT_BIGTAB_MAXROOT
247 #define NTL_FFT_BIGTAB_MINROOT (7)
248
249 // table sizes are bounded by 2^bound, where
250 // bound = NTL_FFT_BIGTAB_MAXROOT-index/NTL_FFT_BIGTAB_LIMIT.
251 // Here, index is the index of an FFT prime, or 0 for a user FFT prime.
252 // If bound <= NTL_FFT_BIGTAB_MINROOT, then big tables are not used,
253 // so only the first
254 // (NTL_FFT_BIGTAB_MAXROOT-NTL_FFT_BIGTAB_MINROOT)*NTL_FFT_BIGTAB_LIMIT
255 // FFT primes will have big tables.
268256
269257 // NOTE: in newer versions of NTL (v9.1 and later), the BIGTAB
270258 // code is only about 5-15% faster than the non-BIGTAB code, so
271259 // this is not a great time/space trade-off.
260 // However, some further optimizations may only be implemented
261 // if big tables are used.
272262
273263 // NOTE: NTL_FFT_BIGTAB_MAXROOT is set independently of the parameter
274264 // NTL_FFTMaxRoot defined in FFT.h (and which is typically 25).
279269
280270
281271 NTL_START_IMPL
272
273
274
275 class FFTVectorPair {
276 public:
277 Vec<long> wtab_precomp;
278 Vec<mulmod_precon_t> wqinvtab_precomp;
279 };
280
281 typedef LazyTable<FFTVectorPair, NTL_FFTMaxRoot+1> FFTMultipliers;
282
283
284 #ifdef NTL_ENABLE_AVX_FFT
285 class pd_FFTVectorPair {
286 public:
287 AlignedArray<double> wtab_precomp;
288 AlignedArray<double> wqinvtab_precomp;
289 };
290
291 typedef LazyTable<pd_FFTVectorPair, NTL_FFTMaxRoot+1> pd_FFTMultipliers;
292 #endif
293
294
295
296 class FFTMulTabs {
297 public:
298
299 #ifndef NTL_ENABLE_AVX_FFT
300 long bound;
301 FFTMultipliers MulTab;
302 #else
303 pd_FFTMultipliers pd_MulTab[2];
304 #endif
305
306 };
307
308 void FFTMulTabsDeleterPolicy::deleter(FFTMulTabs *p) { delete p; }
309
282310
283311
284312 FFTTablesType FFTTables;
428456 }
429457
430458
431 void InitFFTPrimeInfo(FFTPrimeInfo& info, long q, long w, bool bigtab)
432 {
433 mulmod_t qinv = PrepMulMod(q);
434
435 long mr = CalcMaxRoot(q);
436
437 info.q = q;
438 info.qinv = qinv;
439 info.qrecip = 1/double(q);
440 info.zz_p_context = 0;
441
442
443 info.RootTable[0].SetLength(mr+1);
444 info.RootTable[1].SetLength(mr+1);
445 info.TwoInvTable.SetLength(mr+1);
446 info.TwoInvPreconTable.SetLength(mr+1);
447
448 long *rt = &info.RootTable[0][0];
449 long *rit = &info.RootTable[1][0];
450 long *tit = &info.TwoInvTable[0];
451 mulmod_precon_t *tipt = &info.TwoInvPreconTable[0];
452
453 long j;
454 long t;
455
456 rt[mr] = w;
457 for (j = mr-1; j >= 0; j--)
458 rt[j] = MulMod(rt[j+1], rt[j+1], q);
459
460 rit[mr] = InvMod(w, q);
461 for (j = mr-1; j >= 0; j--)
462 rit[j] = MulMod(rit[j+1], rit[j+1], q);
463
464 t = InvMod(2, q);
465 tit[0] = 1;
466 for (j = 1; j <= mr; j++)
467 tit[j] = MulMod(tit[j-1], t, q);
468
469 for (j = 0; j <= mr; j++)
470 tipt[j] = PrepMulModPrecon(tit[j], q, qinv);
471
472 if (bigtab)
473 info.bigtab.make();
474 }
475459
476460
477461 #ifndef NTL_WIZARD_HACK
504488 long q, w;
505489 NextFFTPrime(q, w, i);
506490
507 bool bigtab = false;
491 long bigtab_index = -1;
508492
509493 #ifdef NTL_FFT_BIGTAB
510 if (i < NTL_FFT_BIGTAB_LIMIT)
511 bigtab = true;
494 bigtab_index = i;
512495 #endif
513496
514 InitFFTPrimeInfo(*info, q, w, bigtab);
497 InitFFTPrimeInfo(*info, q, w, bigtab_index);
515498 info->zz_p_context = Build_zz_pInfo(info.get());
516499 bld.move(info);
517500 }
518501
519502 } while (0);
520503 }
521
522
523
524
525
526 #define NTL_PIPELINE
527 // Define to get some software pipelining...actually seems
528 // to help somewhat
529
530 #define NTL_LOOP_UNROLL
531 // Define to unroll some loops. Seems to help a little
532
533 // FIXME: maybe the above two should be tested by the wizard
534
535
536 static
537 long RevInc(long a, long k)
538 {
539 long j, m;
540
541 j = k;
542 m = 1L << (k-1);
543
544 while (j && (m & a)) {
545 a ^= m;
546 m >>= 1;
547 j--;
548 }
549 if (j) a ^= m;
550 return a;
551 }
552
553
554 // FIXME: This could potentially be shared across threads, using
555 // a "lazy table".
556 static inline
557 Vec<long> *get_brc_mem()
558 {
559 NTL_TLS_LOCAL_INIT(Vec< Vec<long> >, brc_mem_vec, (INIT_SIZE, NTL_FFTMaxRoot+1));
560 return brc_mem_vec.elts();
561 }
562
563
564
565 #if 0
566
567
568 static
569 void BitReverseCopy(long * NTL_RESTRICT A, const long * NTL_RESTRICT a, long k)
570 {
571 Vec<long> *brc_mem = get_brc_mem();
572
573 long n = 1L << k;
574 long* NTL_RESTRICT rev;
575 long i, j;
576
577 rev = brc_mem[k].elts();
578 if (!rev) {
579 brc_mem[k].SetLength(n);
580 rev = brc_mem[k].elts();
581 for (i = 0, j = 0; i < n; i++, j = RevInc(j, k))
582 rev[i] = j;
583 }
584
585 for (i = 0; i < n; i++)
586 A[rev[i]] = a[i];
587 }
588
589
590 static
591 void BitReverseCopy(unsigned long * NTL_RESTRICT A, const long * NTL_RESTRICT a, long k)
592 {
593 Vec<long> *brc_mem = get_brc_mem();
594
595 long n = 1L << k;
596 long* NTL_RESTRICT rev;
597 long i, j;
598
599 rev = brc_mem[k].elts();
600 if (!rev) {
601 brc_mem[k].SetLength(n);
602 rev = brc_mem[k].elts();
603 for (i = 0, j = 0; i < n; i++, j = RevInc(j, k))
604 rev[i] = j;
605 }
606
607 for (i = 0; i < n; i++)
608 A[rev[i]] = a[i];
609 }
610
611 #else
612
613
614
615 #define NTL_BRC_THRESH (11)
616 #define NTL_BRC_Q (5)
617
618 // Must have NTL_BRC_THRESH >= 2*NTL_BRC_Q
619 // Should also have (1L << (2*NTL_BRC_Q)) small enough
620 // so that we can fit that many long's into the cache
621
622
623 static
624 long *BRC_init(long k)
625 {
626 Vec<long> *brc_mem = get_brc_mem();
627
628 long n = (1L << k);
629 brc_mem[k].SetLength(n);
630 long *rev = brc_mem[k].elts();
631 long i, j;
632 for (i = 0, j = 0; i < n; i++, j = RevInc(j, k))
633 rev[i] = j;
634 return rev;
635 }
636
637
638 static
639 void BasicBitReverseCopy(long * NTL_RESTRICT B,
640 const long * NTL_RESTRICT A, long k)
641 {
642 Vec<long> *brc_mem = get_brc_mem();
643
644 long n = 1L << k;
645 long* NTL_RESTRICT rev;
646 long i, j;
647
648 rev = brc_mem[k].elts();
649 if (!rev) rev = BRC_init(k);
650
651 for (i = 0; i < n; i++)
652 B[rev[i]] = A[i];
653 }
654
655
656
657 static
658 void COBRA(long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
659 {
660 Vec<long> *brc_mem = get_brc_mem();
661
662 NTL_TLS_LOCAL(Vec<long>, BRC_temp);
663
664 long q = NTL_BRC_Q;
665 long k1 = k - 2*q;
666 long * NTL_RESTRICT rev_k1, * NTL_RESTRICT rev_q;
667 long *NTL_RESTRICT T;
668 long a, b, c, a1, b1, c1;
669 long i, j;
670
671 rev_k1 = brc_mem[k1].elts();
672 if (!rev_k1) rev_k1 = BRC_init(k1);
673
674 rev_q = brc_mem[q].elts();
675 if (!rev_q) rev_q = BRC_init(q);
676
677 T = BRC_temp.elts();
678 if (!T) {
679 BRC_temp.SetLength(1L << (2*q));
680 T = BRC_temp.elts();
681 }
682
683 for (b = 0; b < (1L << k1); b++) {
684 b1 = rev_k1[b];
685 for (a = 0; a < (1L << q); a++) {
686 a1 = rev_q[a];
687 for (c = 0; c < (1L << q); c++)
688 T[(a1 << q) + c] = A[(a << (k1+q)) + (b << q) + c];
689 }
690
691 for (c = 0; c < (1L << q); c++) {
692 c1 = rev_q[c];
693 for (a1 = 0; a1 < (1L << q); a1++)
694 B[(c1 << (k1+q)) + (b1 << q) + a1] = T[(a1 << q) + c];
695 }
696 }
697 }
698
699 static
700 void BitReverseCopy(long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
701 {
702 if (k <= NTL_BRC_THRESH)
703 BasicBitReverseCopy(B, A, k);
704 else
705 COBRA(B, A, k);
706 }
707
708
709 static
710 void BasicBitReverseCopy(unsigned long * NTL_RESTRICT B,
711 const long * NTL_RESTRICT A, long k)
712 {
713 Vec<long> *brc_mem = get_brc_mem();
714
715 long n = 1L << k;
716 long* NTL_RESTRICT rev;
717 long i, j;
718
719 rev = brc_mem[k].elts();
720 if (!rev) rev = BRC_init(k);
721
722 for (i = 0; i < n; i++)
723 B[rev[i]] = A[i];
724 }
725
726
727
728 static
729 void COBRA(unsigned long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
730 {
731 Vec<long> *brc_mem = get_brc_mem();
732
733 NTL_TLS_LOCAL(Vec<unsigned long>, BRC_temp);
734
735 long q = NTL_BRC_Q;
736 long k1 = k - 2*q;
737 long * NTL_RESTRICT rev_k1, * NTL_RESTRICT rev_q;
738 unsigned long *NTL_RESTRICT T;
739 long a, b, c, a1, b1, c1;
740 long i, j;
741
742 rev_k1 = brc_mem[k1].elts();
743 if (!rev_k1) rev_k1 = BRC_init(k1);
744
745 rev_q = brc_mem[q].elts();
746 if (!rev_q) rev_q = BRC_init(q);
747
748 T = BRC_temp.elts();
749 if (!T) {
750 BRC_temp.SetLength(1L << (2*q));
751 T = BRC_temp.elts();
752 }
753
754 for (b = 0; b < (1L << k1); b++) {
755 b1 = rev_k1[b];
756 for (a = 0; a < (1L << q); a++) {
757 a1 = rev_q[a];
758 for (c = 0; c < (1L << q); c++)
759 T[(a1 << q) + c] = A[(a << (k1+q)) + (b << q) + c];
760 }
761
762 for (c = 0; c < (1L << q); c++) {
763 c1 = rev_q[c];
764 for (a1 = 0; a1 < (1L << q); a1++)
765 B[(c1 << (k1+q)) + (b1 << q) + a1] = T[(a1 << q) + c];
766 }
767 }
768 }
769
770 static
771 void BitReverseCopy(unsigned long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
772 {
773 if (k <= NTL_BRC_THRESH)
774 BasicBitReverseCopy(B, A, k);
775 else
776 COBRA(B, A, k);
777 }
778
779
780
781
782 #endif
783504
784505
785506 #ifdef NTL_FFT_LAZYMUL
798519
799520 #endif
800521
801 #ifndef NTL_FFT_BIGTAB
802
803 #define NTL_FFT_ROUTINE_TAB FFT_aux
804 #define NTL_FFT_ROUTINE_NOTAB FFT
805
806 #else
807
808 #define NTL_FFT_ROUTINE_TAB FFT
809 #define NTL_FFT_ROUTINE_NOTAB FFT_aux
810
522
523
524
525 #ifdef NTL_FFT_LAZYMUL
526 // FFT with lazy multiplication
527
528 #ifdef NTL_CLEAN_INT
529 #define NTL_FFT_USEBUF
811530 #endif
812
813
814
815
816
817
818 #ifndef NTL_FFT_LAZYMUL
819
820
821 // A basic FFT, no tables, no lazy strategy
822
823 void NTL_FFT_ROUTINE_NOTAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
824 // performs a 2^k-point convolution modulo q
825
826 {
827 long q = info.q;
828 const long *root = info.RootTable[dir].elts();
829 mulmod_t qinv = info.qinv;
830
831 if (k <= 1) {
832 if (k == 0) {
833 A[0] = a[0];
834 return;
835 }
836 if (k == 1) {
837 long a0 = AddMod(a[0], a[1], q);
838 long a1 = SubMod(a[0], a[1], q);
839 A[0] = a0;
840 A[1] = a1;
841 return;
842 }
843 }
844
845 // assume k > 1
846
847 NTL_TLS_LOCAL(Vec<long>, wtab_store);
848 NTL_TLS_LOCAL(Vec<mulmod_precon_t>, wqinvtab_store);
849 NTL_TLS_LOCAL(Vec<long>, AA_store);
850
851 wtab_store.SetLength(1L << (k-2));
852 wqinvtab_store.SetLength(1L << (k-2));
853 AA_store.SetLength(1L << k);
854
855 long * NTL_RESTRICT wtab = wtab_store.elts();
856 mulmod_precon_t * NTL_RESTRICT wqinvtab = wqinvtab_store.elts();
857 long *AA = AA_store.elts();
858
859 wtab[0] = 1;
860 wqinvtab[0] = PrepMulModPrecon(1, q, qinv);
861
862
863 BitReverseCopy(AA, a, k);
864
865 long n = 1L << k;
866
867 long s, m, m_half, m_fourth, i, j, t, u, t1, u1, tt, tt1;
868
869 long w;
870 mulmod_precon_t wqinv;
871
872 // s = 1
873
874 for (i = 0; i < n; i += 2) {
875 t = AA[i + 1];
876 u = AA[i];
877 AA[i] = AddMod(u, t, q);
878 AA[i+1] = SubMod(u, t, q);
879 }
880
881
882
883 for (s = 2; s < k; s++) {
884 m = 1L << s;
885 m_half = 1L << (s-1);
886 m_fourth = 1L << (s-2);
887
888 w = root[s];
889 wqinv = PrepMulModPrecon(w, q, qinv);
890
891 // prepare wtab...
892
893 #if 1
894 // plain version...
895
896 for (i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
897 long w_j = wtab[j];
898 mulmod_precon_t wqi_j = wqinvtab[j];
899 long w_i = MulModPrecon(w_j, w, q, wqinv);
900 mulmod_precon_t wqi_i = PrepMulModPrecon(w_i, q, qinv);
901
902 wtab[i-1] = w_j;
903 wqinvtab[i-1] = wqi_j;
904 wtab[i] = w_i;
905 wqinvtab[i] = wqi_i;
906 }
907 #else
908 // software pipeline version...doesn't seem to make a big difference
909
910 if (s == 2) {
911 wtab[1] = MulModPrecon(wtab[0], w, q, wqinv);
912 wqinvtab[1] = PrepMulModPrecon(wtab[1], q, qinv);
913 }
914 else {
915 i = m_half-1; j = m_fourth-1;
916 wtab[i-1] = wtab[j];
917 wqinvtab[i-1] = wqinvtab[j];
918 wtab[i] = MulModPrecon(wtab[i-1], w, q, wqinv);
919
920 i -= 2; j --;
921
922 for (; i >= 0; i -= 2, j --) {
923 long wp2 = wtab[i+2];
924 long wm1 = wtab[j];
925 wqinvtab[i+2] = PrepMulModPrecon(wp2, q, qinv);
926 wtab[i-1] = wm1;
927 wqinvtab[i-1] = wqinvtab[j];
928 wtab[i] = MulModPrecon(wm1, w, q, wqinv);
929 }
930
931 wqinvtab[1] = PrepMulModPrecon(wtab[1], q, qinv);
932 }
933
934
935 #endif
936
937
938 for (i = 0; i < n; i+= m) {
939
940 long * NTL_RESTRICT AA0 = &AA[i];
941 long * NTL_RESTRICT AA1 = &AA[i + m_half];
942
943
944
945 #if 1
946 // loop unrolling and pipelining
947
948 t = AA1[0];
949 u = AA0[0];
950 t1 = MulModPrecon(AA1[1], w, q, wqinv);
951 u1 = AA0[1];
952
953
954
955 for (j = 0; j < m_half-2; j += 2) {
956 long a02 = AA0[j+2];
957 long a03 = AA0[j+3];
958 long a12 = AA1[j+2];
959 long a13 = AA1[j+3];
960 long w2 = wtab[j+2];
961 long w3 = wtab[j+3];
962 mulmod_precon_t wqi2 = wqinvtab[j+2];
963 mulmod_precon_t wqi3 = wqinvtab[j+3];
964
965 tt = MulModPrecon(a12, w2, q, wqi2);
966 long b00 = AddMod(u, t, q);
967 long b10 = SubMod(u, t, q);
968 t = tt;
969 u = a02;
970
971 tt1 = MulModPrecon(a13, w3, q, wqi3);
972 long b01 = AddMod(u1, t1, q);
973 long b11 = SubMod(u1, t1, q);
974 t1 = tt1;
975 u1 = a03;
976
977 AA0[j] = b00;
978 AA1[j] = b10;
979 AA0[j+1] = b01;
980 AA1[j+1] = b11;
981 }
982
983
984 AA0[j] = AddMod(u, t, q);
985 AA1[j] = SubMod(u, t, q);
986 AA0[j + 1] = AddMod(u1, t1, q);
987 AA1[j + 1] = SubMod(u1, t1, q);
988
989
990 #else
991 // no loop unrolling, but still some pipelining
992
993
994 t = AA1[0];
995 u = AA0[0];
996
997 for (j = 0; j < m_half-1; j++) {
998 long a02 = AA0[j+1];
999 long a12 = AA1[j+1];
1000 long w2 = wtab[j+1];
1001 mulmod_precon_t wqi2 = wqinvtab[j+1];
1002
1003 tt = MulModPrecon(a12, w2, q, wqi2);
1004 long b00 = AddMod(u, t, q);
1005 long b10 = SubMod(u, t, q);
1006 t = tt;
1007 u = a02;
1008
1009 AA0[j] = b00;
1010 AA1[j] = b10;
1011 }
1012
1013
1014 AA0[j] = AddMod(u, t, q);
1015 AA1[j] = SubMod(u, t, q);
1016
1017
1018 #endif
1019 }
1020 }
1021
1022
1023 // s == k...special case
1024
1025 m = 1L << s;
1026 m_half = 1L << (s-1);
1027 m_fourth = 1L << (s-2);
1028
1029
1030 w = root[s];
1031 wqinv = PrepMulModPrecon(w, q, qinv);
1032
1033 // j = 0, 1
1034
1035 t = AA[m_half];
1036 u = AA[0];
1037 t1 = MulModPrecon(AA[1+ m_half], w, q, wqinv);
1038 u1 = AA[1];
1039
1040 A[0] = AddMod(u, t, q);
1041 A[m_half] = SubMod(u, t, q);
1042 A[1] = AddMod(u1, t1, q);
1043 A[1 + m_half] = SubMod(u1, t1, q);
1044
1045 for (j = 2; j < m_half; j += 2) {
1046 t = MulModPrecon(AA[j + m_half], wtab[j >> 1], q, wqinvtab[j >> 1]);
1047 u = AA[j];
1048 t1 = MulModPrecon(AA[j + 1+ m_half], wtab[j >> 1], q,
1049 wqinvtab[j >> 1]);
1050 t1 = MulModPrecon(t1, w, q, wqinv);
1051 u1 = AA[j + 1];
1052
1053 A[j] = AddMod(u, t, q);
1054 A[j + m_half] = SubMod(u, t, q);
1055 A[j + 1] = AddMod(u1, t1, q);
1056 A[j + 1 + m_half] = SubMod(u1, t1, q);
1057
1058 }
1059 }
1060
1061
1062
1063
1064
1065
1066
1067 #else
1068
1069
1070
1071 // FFT with lazy multiplication
531 // DIRT: with the lazy multiplication strategy, we have to work
532 // with unsigned long's rather than long's. To avoid unnecessary
533 // copying, we simply cast long* to unsigned long*.
534 // Is this standards-compliant? Does it invoke Undefined Behavior?
535 // The C++ standards before C++14 were actually somewhat inconsistent
536 // on this point.
537
538 // In all versions of the C++ and C standards, the "strict aliasing"
539 // rules [basic.lval] have always said that signed/unsigned can
540 // always alias each other. So this does not break the strict
541 // aliasing rules. However, prior to C++14, the section
542 // on Lvalue-to-rvalue conversion [conv.lval] said that
543 // this was actually UB. This has been cleared up in C++14,
544 // where now it is no longer UB. Actually, it seems that the change
545 // to C++14 was cleaning up an inconsistency in the standard
546 // itself, and not really a change in the language definition.
547
548 // In practice, it does make a significant difference in performance
549 // to avoid all these copies, so the default is to avoid them.
550
551 // See: https://stackoverflow.com/questions/30048135/efficient-way-to-bit-copy-a-signed-integer-to-an-unsigned-integer
552
553 // See: https://stackoverflow.com/questions/27109701/aliasing-of-otherwise-equivalent-signed-and-unsigned-types
554 // Especially comments by Columbo regarding N3797 and [conv.lval]
555
556
557
558
559
1072560
1073561 #if (defined(NTL_LONGLONG_SP_MULMOD))
1074562
1278766 }
1279767
1280768
769 typedef long mint_t;
770 typedef unsigned long umint_t;
771 // For readability and to make it easier to adapt this
772 // code to other settings
773
1281774 static inline
1282 unsigned long LazyReduce1(unsigned long a, long q)
1283 {
1284 return sp_CorrectExcess(long(a), q);
775 umint_t LazyReduce1(umint_t a, mint_t q)
776 {
777 return sp_CorrectExcess(mint_t(a), q);
1285778 }
1286779
1287780 static inline
1288 unsigned long LazyReduce2(unsigned long a, long q)
781 umint_t LazyReduce2(umint_t a, mint_t q)
1289782 {
1290783 return sp_CorrectExcess(a, 2*q);
1291784 }
1292785
1293786
1294
1295
1296 // FFT: Lazy, no tables
1297
1298 void NTL_FFT_ROUTINE_NOTAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
1299
1300 // performs a 2^k-point convolution modulo q
1301
1302 {
1303 long q = info.q;
1304 const long *root = info.RootTable[dir].elts();
1305 mulmod_t qinv = info.qinv;
1306
1307 if (k <= 1) {
1308 if (k == 0) {
1309 A[0] = a[0];
1310 return;
1311 }
1312 if (k == 1) {
1313 long a0 = AddMod(a[0], a[1], q);
1314 long a1 = SubMod(a[0], a[1], q);
1315 A[0] = a0;
1316 A[1] = a1;
1317 return;
1318 }
1319 }
1320
1321 // assume k >= 2
1322
1323 NTL_TLS_LOCAL(Vec<unsigned long>, AA_store);
1324 AA_store.SetLength(1L << k);
1325 unsigned long *AA = AA_store.elts();
1326
1327 NTL_TLS_LOCAL(Vec<long>, wtab_store);
1328 wtab_store.SetLength(max(2, 1L << (k-2)));
1329 // allocate space for at least 2 elements, to deal with a corner case when k == 2
1330 long * NTL_RESTRICT wtab = wtab_store.elts();
1331
1332 NTL_TLS_LOCAL(Vec<mulmod_precon_t>, wqinvtab_store);
1333 wqinvtab_store.SetLength(max(2, 1L << (k-2)));
1334 // allocate space for at least 2 elements, to deal with a corner case when k == 2
1335 mulmod_precon_t * NTL_RESTRICT wqinvtab = wqinvtab_store.elts();
1336
1337
1338 BitReverseCopy(AA, a, k);
1339
1340 long n = 1L << k;
1341
1342
1343 /* we work with redundant representations, in the range [0, 4q) */
1344
1345 long s, m, m_half, m_fourth, i, j;
1346 unsigned long t, u, t1, u1;
1347
1348
1349 wtab[0] = 1;
1350 wqinvtab[0] = LazyPrepMulModPrecon(1, q, qinv);
1351
1352 // s = 1
1353 for (i = 0; i < n; i += 2) {
1354 t = AA[i + 1];
1355 u = AA[i];
1356 AA[i] = u + t;
1357 AA[i+1] = u - t + q;
1358 }
1359
1360 // s = 2
1361 {
1362 long w = root[2];
1363 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, qinv);
1364
1365 wtab[1] = w;
1366 wqinvtab[1] = wqinv;
1367
1368
1369 for (i = 0; i < n; i += 4) {
1370
1371 unsigned long * NTL_RESTRICT AA0 = &AA[i];
1372 unsigned long * NTL_RESTRICT AA1 = &AA[i + 2];
1373
1374 {
1375 const unsigned long a11 = AA1[0];
1376 const unsigned long a01 = AA0[0];
1377
1378 const unsigned long tt1 = a11;
1379 const unsigned long uu1 = a01;
1380 const unsigned long b01 = uu1 + tt1;
1381 const unsigned long b11 = uu1 - tt1 + 2*q;
1382
1383 AA0[0] = b01;
1384 AA1[0] = b11;
1385 }
1386 {
1387 const unsigned long a11 = AA1[1];
1388 const unsigned long a01 = AA0[1];
1389
1390 const unsigned long tt1 = LazyMulModPrecon(a11, w, q, wqinv);
1391 const unsigned long uu1 = a01;
1392 const unsigned long b01 = uu1 + tt1;
1393 const unsigned long b11 = uu1 - tt1 + 2*q;
1394
1395 AA0[1] = b01;
1396 AA1[1] = b11;
1397 }
1398 }
1399 }
1400
1401
1402 // s = 3..k-1
1403
1404 for (s = 3; s < k; s++) {
1405 m = 1L << s;
1406 m_half = 1L << (s-1);
1407 m_fourth = 1L << (s-2);
1408
1409 long w = root[s];
787 // inputs in [0, 2*n), output in [0, 4*n)
788 static inline
789 umint_t LazyAddMod(umint_t a, umint_t b, mint_t n)
790 {
791 return a+b;
792 }
793
794 // inputs in [0, 2*n), output in [0, 4*n)
795 static inline
796 umint_t LazySubMod(umint_t a, umint_t b, mint_t n)
797 {
798 return a-b+2*n;
799 }
800
801 // inputs in [0, 2*n), output in [0, 2*n)
802 static inline
803 umint_t LazyAddMod2(umint_t a, umint_t b, mint_t n)
804 {
805 umint_t r = a+b;
806 return sp_CorrectExcess(r, 2*n);
807 }
808
809 // inputs in [0, 2*n), output in [0, 2*n)
810 static inline
811 umint_t LazySubMod2(umint_t a, umint_t b, mint_t n)
812 {
813 umint_t r = a-b;
814 return sp_CorrectDeficit(r, 2*n);
815 }
816
817 #ifdef NTL_AVOID_BRANCHING
818
819 // x, y in [0, 4*m)
820 // returns x + y mod 4*m, in [0, 4*m)
821 inline static umint_t
822 LazyAddMod4(umint_t x, umint_t y, mint_t m)
823 {
824 x = LazyReduce2(x, m);
825 y = LazyReduce2(y, m);
826 return x+y;
827 }
828
829 // x, y in [0, 4*m)
830 // returns x - y mod 4*m, in [0, 4*m)
831 inline static umint_t
832 LazySubMod4(umint_t x, umint_t y, mint_t m)
833 {
834 x = LazyReduce2(x, m);
835 y = LazyReduce2(y, m);
836 return x-y+2*m;
837 }
838
839 #else
840
841 static inline umint_t
842 LazyAddMod4(umint_t x, umint_t y, umint_t m)
843 {
844 y = 4*m - y;
845 umint_t z = x - y;
846 z += (x < y) ? 4*m : 0;
847 return z;
848 }
849
850
851 static inline umint_t
852 LazySubMod4(umint_t x, umint_t y, umint_t m)
853 {
854 umint_t z = x - y;
855 z += (x < y) ? 4*m : 0;
856 return z;
857 }
858
859 #endif
860
861 // Input and output in [0, 4*n)
862 static inline umint_t
863 LazyDoubleMod4(umint_t a, mint_t n)
864 {
865 return 2 * LazyReduce2(a, n);
866 }
867
868 // Input and output in [0, 2*n)
869 static inline umint_t
870 LazyDoubleMod2(umint_t a, mint_t n)
871 {
872 return 2 * LazyReduce1(a, n);
873 }
874
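// A small worked sketch (not compiled) of the range contracts documented
// above, using a hypothetical tiny prime q = 5 and two inputs already in the
// redundant range [0, 2*q).  The values in the comments follow directly from
// the definitions of the helpers; LazyReduce1/LazyReduce2 are the final
// reduction helpers used after the last FFT layer.
#if 0
static void lazy_range_sketch()
{
   const mint_t q = 5;
   umint_t a = 7, b = 9;                    // both in [0, 2*q) = [0, 10)

   umint_t s4 = LazyAddMod(a, b, q);        // 16,           in [0, 4*q)
   umint_t d4 = LazySubMod(a, b, q);        // 7-9+10 = 8,   in [0, 4*q)
   umint_t s2 = LazyAddMod2(a, b, q);       // 16-10 = 6,    in [0, 2*q)
   umint_t d2 = LazySubMod2(a, b, q);       // 8,            in [0, 2*q)

   // full reduction at the very end, as done after the last FFT layer:
   umint_t r = LazyReduce1(LazyReduce2(s4, q), q);   // 16 mod 5 = 1
}
#endif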
875 void ComputeMultipliers(Vec<FFTVectorPair>& v, long k, mint_t q, mulmod_t qinv, const mint_t* root)
876 {
877
878 long old_len = v.length();
879 v.SetLength(k+1);
880
881 for (long s = max(old_len, 1); s <= k; s++) {
882 v[s].wtab_precomp.SetLength(1L << (s-1));
883 v[s].wqinvtab_precomp.SetLength(1L << (s-1));
884 }
885
886 if (k >= 1) {
887 v[1].wtab_precomp[0] = 1;
888 v[1].wqinvtab_precomp[0] = LazyPrepMulModPrecon(1, q, qinv);
889 }
890
891 if (k >= 2) {
892 v[2].wtab_precomp[0] = v[1].wtab_precomp[0];
893 v[2].wtab_precomp[1] = root[2];
894 v[2].wqinvtab_precomp[0] = v[1].wqinvtab_precomp[0];
895 v[2].wqinvtab_precomp[1] = LazyPrepMulModPrecon(root[2], q, qinv);
896 }
897
898 for (long s = 3; s <= k; s++) {
899 long m = 1L << s;
900 long m_half = 1L << (s-1);
901 long m_fourth = 1L << (s-2);
902 mint_t* NTL_RESTRICT wtab = v[s].wtab_precomp.elts();
903 mint_t* NTL_RESTRICT wtab1 = v[s-1].wtab_precomp.elts();
904 mulmod_precon_t* NTL_RESTRICT wqinvtab = v[s].wqinvtab_precomp.elts();
905 mulmod_precon_t* NTL_RESTRICT wqinvtab1 = v[s-1].wqinvtab_precomp.elts();
906
907 mint_t w = root[s];
908 umint_t wqinv_rem;
909 mulmod_precon_t wqinv = LazyPrepMulModPreconWithRem(wqinv_rem, w, q, qinv);
910
911
912 for (long i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
913 mint_t w_j = wtab1[j];
914 mulmod_precon_t wqi_j = wqinvtab1[j];
1410915
1411916 #if 0
1412 // This computes all the multipliers in a straightforward fashion.
1413       // It's a bit slower than the strategy used below, even if
1414 // NTL_LONGLONG_SP_MULMOD is set
1415
1416 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, qinv);
1417
1418
1419 for (i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
1420 long w_j = wtab[j];
1421 mulmod_precon_t wqi_j = wqinvtab[j];
1422
1423 long w_i = LazyReduce1(LazyMulModPrecon(w_j, w, q, wqinv), q);
1424 mulmod_precon_t wqi_i = LazyPrepMulModPrecon(w_i, q, qinv);
917 mint_t w_i = LazyReduce1(LazyMulModPrecon(w_j, w, q, wqinv), q);
918 mulmod_precon_t wqi_i = LazyMulModPreconQuo(wqinv_rem, w_j, q, wqi_j)
919 + cast_unsigned(w_j)*wqinv;
920 #else
921 // This code sequence makes sure the compiler sees
922 // that the product w_j*wqinv needs to be computed just once
923 ll_type x;
924 ll_mul(x, w_j, wqinv);
925 umint_t hi = ll_get_hi(x);
926 umint_t lo = ll_get_lo(x);
927 umint_t r = cast_unsigned(w_j)*cast_unsigned(w) - hi*cast_unsigned(q);
928
929 mint_t w_i = LazyReduce1(r, q);
930 mulmod_precon_t wqi_i = lo+LazyMulModPreconQuo(wqinv_rem, w_j, q, wqi_j);
931 #endif
1425932
1426933 wtab[i-1] = w_j;
1427934 wqinvtab[i-1] = wqi_j;
1428935 wtab[i] = w_i;
1429936 wqinvtab[i] = wqi_i;
1430937 }
938 }
939
940 #if 0
941 // verify result
942 for (long s = 1; s <= k; s++) {
943 mint_t *wtab = v[s].wtab_precomp.elts();
944 mulmod_precon_t *wqinvtab = v[s].wqinvtab_precomp.elts();
945 long m_half = 1L << (s-1);
946
947 mint_t w = root[s];
948 mint_t w_i = 1;
949 for (long i = 0; i < m_half; i++) {
950 if (wtab[i] != w_i || wqinvtab[i] != LazyPrepMulModPrecon(w_i, q, qinv))
951 Error("bad table entry");
952 w_i = MulMod(w_i, w, q, qinv);
953 }
954 }
955 #endif
956 }
957
958
1431959 #else
1432 unsigned long wqinv_rem;
1433 mulmod_precon_t wqinv = LazyPrepMulModPreconWithRem(wqinv_rem, w, q, qinv);
1434
1435
1436 for (i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
1437 long w_j = wtab[j];
1438 mulmod_precon_t wqi_j = wqinvtab[j];
1439
1440 // The next two lines are equivalent, but the first involves
1441 // a computation of hi(w_j*wqinv), which pairs with the
1442 // computation of lo(w_j*wqinv) below...but I don't think
1443 // the compiler sees this...oh well...
1444
1445 long w_i = LazyReduce1(LazyMulModPrecon(w_j, w, q, wqinv), q);
1446 // long w_i = LazyReduce1(LazyMulModPrecon(w, w_j, q, wqi_j), q);
1447
1448 mulmod_precon_t wqi_i = LazyMulModPreconQuo(wqinv_rem, w_j, q, wqi_j)
1449 + cast_unsigned(w_j)*wqinv;
960
961
962 // Hacks to make the LAZY code work with ordinary modular arithmetic
963
964 typedef long mint_t;
965 typedef long umint_t;
966
967 static inline mint_t IdentityMod(mint_t a, mint_t q) { return a; }
968 static inline mint_t DoubleMod(mint_t a, mint_t q) { return AddMod(a, a, q); }
969
970 #define LazyPrepMulModPrecon PrepMulModPrecon
971 #define LazyMulModPrecon MulModPrecon
972
973 #define LazyReduce1 IdentityMod
974 #define LazyReduce2 IdentityMod
975 #define LazyAddMod AddMod
976 #define LazySubMod SubMod
977 #define LazyAddMod2 AddMod
978 #define LazySubMod2 SubMod
979 #define LazyAddMod4 AddMod
980 #define LazySubMod4 SubMod
981 #define LazyDoubleMod2 DoubleMod
982 #define LazyDoubleMod4 DoubleMod
983
984
985 void ComputeMultipliers(Vec<FFTVectorPair>& v, long k, mint_t q, mulmod_t qinv, const mint_t* root)
986 {
987
988 long old_len = v.length();
989 v.SetLength(k+1);
990
991 for (long s = max(old_len, 1); s <= k; s++) {
992 v[s].wtab_precomp.SetLength(1L << (s-1));
993 v[s].wqinvtab_precomp.SetLength(1L << (s-1));
994 }
995
996 if (k >= 1) {
997 v[1].wtab_precomp[0] = 1;
998 v[1].wqinvtab_precomp[0] = PrepMulModPrecon(1, q, qinv);
999 }
1000
1001 if (k >= 2) {
1002 v[2].wtab_precomp[0] = v[1].wtab_precomp[0];
1003 v[2].wtab_precomp[1] = root[2];
1004 v[2].wqinvtab_precomp[0] = v[1].wqinvtab_precomp[0];
1005 v[2].wqinvtab_precomp[1] = PrepMulModPrecon(root[2], q, qinv);
1006 }
1007
1008 for (long s = 3; s <= k; s++) {
1009 long m = 1L << s;
1010 long m_half = 1L << (s-1);
1011 long m_fourth = 1L << (s-2);
1012 mint_t* NTL_RESTRICT wtab = v[s].wtab_precomp.elts();
1013 mint_t* NTL_RESTRICT wtab1 = v[s-1].wtab_precomp.elts();
1014 mulmod_precon_t* NTL_RESTRICT wqinvtab = v[s].wqinvtab_precomp.elts();
1015 mulmod_precon_t* NTL_RESTRICT wqinvtab1 = v[s-1].wqinvtab_precomp.elts();
1016
1017 mint_t w = root[s];
1018 mulmod_precon_t wqinv = PrepMulModPrecon(w, q, qinv);
1019
1020
1021 for (long i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
1022 mint_t w_j = wtab1[j];
1023 mulmod_precon_t wqi_j = wqinvtab1[j];
1024
1025 mint_t w_i = MulModPrecon(w_j, w, q, wqinv);
1026 mulmod_precon_t wqi_i = PrepMulModPrecon(w_i, q, qinv);
14501027
14511028 wtab[i-1] = w_j;
14521029 wqinvtab[i-1] = wqi_j;
14531030 wtab[i] = w_i;
14541031 wqinvtab[i] = wqi_i;
14551032 }
1456
1457
1033 }
1034
1035 #if 0
1036 // verify result
1037 for (long s = 1; s <= k; s++) {
1038 mint_t *wtab = v[s].wtab_precomp.elts();
1039 mulmod_precon_t *wqinvtab = v[s].wqinvtab_precomp.elts();
1040 long m_half = 1L << (s-1);
1041
1042 mint_t w = root[s];
1043 mint_t w_i = 1;
1044 for (long i = 0; i < m_half; i++) {
1045 if (wtab[i] != w_i || wqinvtab[i] != PrepMulModPrecon(w_i, q, qinv))
1046 Error("bad table entry");
1047 w_i = MulMod(w_i, w, q, qinv);
1048 }
1049 }
14581050 #endif
1459
1460 for (i = 0; i < n; i += m) {
1461
1462 unsigned long * NTL_RESTRICT AA0 = &AA[i];
1463 unsigned long * NTL_RESTRICT AA1 = &AA[i + m_half];
1464
1465
1466 for (j = 0; j < m_half; j += 4) {
1467 {
1468 const long w1 = wtab[j+0];
1469 const mulmod_precon_t wqi1 = wqinvtab[j+0];
1470 const unsigned long a11 = AA1[j+0];
1471 const unsigned long a01 = AA0[j+0];
1472
1473 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1474 const unsigned long uu1 = LazyReduce2(a01, q);
1475 const unsigned long b01 = uu1 + tt1;
1476 const unsigned long b11 = uu1 - tt1 + 2*q;
1477
1478 AA0[j+0] = b01;
1479 AA1[j+0] = b11;
1480 }
1481 {
1482 const long w1 = wtab[j+1];
1483 const mulmod_precon_t wqi1 = wqinvtab[j+1];
1484 const unsigned long a11 = AA1[j+1];
1485 const unsigned long a01 = AA0[j+1];
1486
1487 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1488 const unsigned long uu1 = LazyReduce2(a01, q);
1489 const unsigned long b01 = uu1 + tt1;
1490 const unsigned long b11 = uu1 - tt1 + 2*q;
1491
1492 AA0[j+1] = b01;
1493 AA1[j+1] = b11;
1494 }
1495 {
1496 const long w1 = wtab[j+2];
1497 const mulmod_precon_t wqi1 = wqinvtab[j+2];
1498 const unsigned long a11 = AA1[j+2];
1499 const unsigned long a01 = AA0[j+2];
1500
1501 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1502 const unsigned long uu1 = LazyReduce2(a01, q);
1503 const unsigned long b01 = uu1 + tt1;
1504 const unsigned long b11 = uu1 - tt1 + 2*q;
1505
1506 AA0[j+2] = b01;
1507 AA1[j+2] = b11;
1508 }
1509 {
1510 const long w1 = wtab[j+3];
1511 const mulmod_precon_t wqi1 = wqinvtab[j+3];
1512 const unsigned long a11 = AA1[j+3];
1513 const unsigned long a01 = AA0[j+3];
1514
1515 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1516 const unsigned long uu1 = LazyReduce2(a01, q);
1517 const unsigned long b01 = uu1 + tt1;
1518 const unsigned long b11 = uu1 - tt1 + 2*q;
1519
1520 AA0[j+3] = b01;
1521 AA1[j+3] = b11;
1522 }
1523 }
1524 }
1525 }
1526
1527
1528
1529 // special case: s == k to avoid extraneous computation of constants
1530
1531 if (k > 2) {
1532 s = k;
1533
1534 m = 1L << s;
1535 m_half = 1L << (s-1);
1536 m_fourth = 1L << (s-2);
1537
1538 long w = root[s];
1539 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, qinv);
1540
1541
1542 for (i = 0; i < n; i += m) {
1543
1544 unsigned long * NTL_RESTRICT AA0 = &AA[i];
1545 unsigned long * NTL_RESTRICT AA1 = &AA[i + m_half];
1546
1547 long half_j;
1548
1549 for (j = 0, half_j = 0; j < m_half; j += 4, half_j += 2) {
1550 {
1551 const long w1 = wtab[half_j+0];
1552 const mulmod_precon_t wqi1 = wqinvtab[half_j+0];
1553 const unsigned long a11 = AA1[j+0];
1554 const unsigned long a01 = AA0[j+0];
1555
1556 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1557 const unsigned long uu1 = LazyReduce2(a01, q);
1558 const unsigned long b01 = uu1 + tt1;
1559 const unsigned long b11 = uu1 - tt1 + 2*q;
1560
1561 AA0[j+0] = b01;
1562 AA1[j+0] = b11;
1563 }
1564 {
1565 const long w1 = wtab[half_j+0];
1566 const mulmod_precon_t wqi1 = wqinvtab[half_j+0];
1567 const unsigned long a11 = AA1[j+1];
1568 const unsigned long a01 = AA0[j+1];
1569
1570 const unsigned long tt1 = LazyMulModPrecon(LazyMulModPrecon(a11, w1, q, wqi1),
1571 w, q, wqinv);
1572 const unsigned long uu1 = LazyReduce2(a01, q);
1573 const unsigned long b01 = uu1 + tt1;
1574 const unsigned long b11 = uu1 - tt1 + 2*q;
1575
1576 AA0[j+1] = b01;
1577 AA1[j+1] = b11;
1578 }
1579 {
1580 const long w1 = wtab[half_j+1];
1581 const mulmod_precon_t wqi1 = wqinvtab[half_j+1];
1582 const unsigned long a11 = AA1[j+2];
1583 const unsigned long a01 = AA0[j+2];
1584
1585 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1586 const unsigned long uu1 = LazyReduce2(a01, q);
1587 const unsigned long b01 = uu1 + tt1;
1588 const unsigned long b11 = uu1 - tt1 + 2*q;
1589
1590 AA0[j+2] = b01;
1591 AA1[j+2] = b11;
1592 }
1593 {
1594 const long w1 = wtab[half_j+1];
1595 const mulmod_precon_t wqi1 = wqinvtab[half_j+1];
1596 const unsigned long a11 = AA1[j+3];
1597 const unsigned long a01 = AA0[j+3];
1598
1599 const unsigned long tt1 = LazyMulModPrecon(LazyMulModPrecon(a11, w1, q, wqi1),
1600 w, q, wqinv);
1601 const unsigned long uu1 = LazyReduce2(a01, q);
1602 const unsigned long b01 = uu1 + tt1;
1603 const unsigned long b11 = uu1 - tt1 + 2*q;
1604
1605 AA0[j+3] = b01;
1606 AA1[j+3] = b11;
1607 }
1608 }
1609 }
1610 }
1611
1612
1613 /* need to reduce redundant representations */
1614
1615 for (i = 0; i < n; i++) {
1616 unsigned long tmp = LazyReduce2(AA[i], q);
1617 A[i] = LazyReduce1(tmp, q);
1618 }
1619 }
1620
1051 }
16211052
16221053 #endif
16231054
16241055
16251056
1626
1627
1628
1629
1630 #ifndef NTL_FFT_LAZYMUL
1631
1632 // FFT with precomputed tables, no lazy mul
1633
16341057 static
1635 void PrecompFFTMultipliers(long k, long q, mulmod_t qinv, const long *root, const FFTMultipliers& tab)
1636 {
1637 if (k < 1) LogicError("PrecompFFTMultipliers: bad input");
1638
1639 do { // NOTE: thread safe lazy init
1640 FFTMultipliers::Builder bld(tab, k+1);
1641 long amt = bld.amt();
1642 if (!amt) break;
1643
1644 long first = k+1-amt;
1645 // initialize entries first..k
1646
1647
1648 for (long s = first; s <= k; s++) {
1649 UniquePtr<FFTVectorPair> item;
1650
1651 if (s == 0) {
1652 bld.move(item); // position 0 not used
1653 continue;
1654 }
1655
1656 if (s == 1) {
1657 item.make();
1658 item->wtab_precomp.SetLength(1);
1659 item->wqinvtab_precomp.SetLength(1);
1660 item->wtab_precomp[0] = 1;
1661 item->wqinvtab_precomp[0] = PrepMulModPrecon(1, q, qinv);
1662 bld.move(item);
1663 continue;
1664 }
1665
1666 item.make();
1667 item->wtab_precomp.SetLength(1L << (s-1));
1668 item->wqinvtab_precomp.SetLength(1L << (s-1));
1669
1670 long m = 1L << s;
1671 long m_half = 1L << (s-1);
1672 long m_fourth = 1L << (s-2);
1673
1674 const long *wtab_last = tab[s-1]->wtab_precomp.elts();
1675 const mulmod_precon_t *wqinvtab_last = tab[s-1]->wqinvtab_precomp.elts();
1676
1677 long *wtab = item->wtab_precomp.elts();
1678 mulmod_precon_t *wqinvtab = item->wqinvtab_precomp.elts();
1679
1680 for (long i = 0; i < m_fourth; i++) {
1681 wtab[i] = wtab_last[i];
1682 wqinvtab[i] = wqinvtab_last[i];
1683 }
1684
1685 long w = root[s];
1686 mulmod_precon_t wqinv = PrepMulModPrecon(w, q, qinv);
1687
1688 // prepare wtab...
1689
1690 if (s == 2) {
1691 wtab[1] = MulModPrecon(wtab[0], w, q, wqinv);
1692 wqinvtab[1] = PrepMulModPrecon(wtab[1], q, qinv);
1693 }
1694 else {
1695 // some software pipelining
1696 long i, j;
1697
1698 i = m_half-1; j = m_fourth-1;
1699 wtab[i-1] = wtab[j];
1700 wqinvtab[i-1] = wqinvtab[j];
1701 wtab[i] = MulModPrecon(wtab[i-1], w, q, wqinv);
1702
1703 i -= 2; j --;
1704
1705 for (; i >= 0; i -= 2, j --) {
1706 long wp2 = wtab[i+2];
1707 long wm1 = wtab[j];
1708 wqinvtab[i+2] = PrepMulModPrecon(wp2, q, qinv);
1709 wtab[i-1] = wm1;
1710 wqinvtab[i-1] = wqinvtab[j];
1711 wtab[i] = MulModPrecon(wm1, w, q, wqinv);
1712 }
1713
1714 wqinvtab[1] = PrepMulModPrecon(wtab[1], q, qinv);
1715 }
1716
1717 bld.move(item);
1718 }
1719 } while (0);
1720 }
1721
1722
1723 // FFT: no lazy, table
1724
1725 void NTL_FFT_ROUTINE_TAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
1726 // performs a 2^k-point convolution modulo q
1727
1728 {
1729 if (!info.bigtab || k > NTL_FFT_BIGTAB_MAXROOT) {
1730 NTL_FFT_ROUTINE_NOTAB(A, a, k, info, dir);
1731 return;
1732 }
1733
1734
1735 long q = info.q;
1736 const long *root = info.RootTable[dir].elts();
1737 mulmod_t qinv = info.qinv;
1738 const FFTMultipliers& tab = info.bigtab->MulTab[dir];
1739
1740
1741 if (k <= 1) {
1742 if (k == 0) {
1743 A[0] = a[0];
1744 return;
1745 }
1746 if (k == 1) {
1747 long a0 = AddMod(a[0], a[1], q);
1748 long a1 = SubMod(a[0], a[1], q);
1749 A[0] = a0;
1750 A[1] = a1;
1751 return;
1752 }
1753 }
1754
1755 // assume k > 1
1756
1757 if (k >= tab.length()) PrecompFFTMultipliers(k, q, qinv, root, tab);
1758
1759 NTL_TLS_LOCAL(Vec<long>, AA_store);
1760 AA_store.SetLength(1L << k);
1761 long *AA = AA_store.elts();
1762
1763 BitReverseCopy(AA, a, k);
1764
1765 long n = 1L << k;
1766
1767 long s, m, m_half, m_fourth, i, j, t, u, t1, u1, tt, tt1;
1768
1769 // s = 1
1770
1771 for (i = 0; i < n; i += 2) {
1772 t = AA[i + 1];
1773 u = AA[i];
1774 AA[i] = AddMod(u, t, q);
1775 AA[i+1] = SubMod(u, t, q);
1776 }
1777
1778
1779 for (s = 2; s < k; s++) {
1780 m = 1L << s;
1781 m_half = 1L << (s-1);
1782 m_fourth = 1L << (s-2);
1783
1784 const long* NTL_RESTRICT wtab = tab[s]->wtab_precomp.elts();
1785 const mulmod_precon_t * NTL_RESTRICT wqinvtab = tab[s]->wqinvtab_precomp.elts();
1786
1787 for (i = 0; i < n; i+= m) {
1788
1789 long * NTL_RESTRICT AA0 = &AA[i];
1790 long * NTL_RESTRICT AA1 = &AA[i + m_half];
1791
1792 #ifdef NTL_PIPELINE
1793
1794 // pipelining: seems to be faster
1795
1796 t = AA1[0];
1797 u = AA0[0];
1798 t1 = MulModPrecon(AA1[1], wtab[1], q, wqinvtab[1]);
1799 u1 = AA0[1];
1800
1801 for (j = 0; j < m_half-2; j += 2) {
1802 long a02 = AA0[j+2];
1803 long a03 = AA0[j+3];
1804 long a12 = AA1[j+2];
1805 long a13 = AA1[j+3];
1806 long w2 = wtab[j+2];
1807 long w3 = wtab[j+3];
1808 mulmod_precon_t wqi2 = wqinvtab[j+2];
1809 mulmod_precon_t wqi3 = wqinvtab[j+3];
1810
1811 tt = MulModPrecon(a12, w2, q, wqi2);
1812 long b00 = AddMod(u, t, q);
1813 long b10 = SubMod(u, t, q);
1814
1815 tt1 = MulModPrecon(a13, w3, q, wqi3);
1816 long b01 = AddMod(u1, t1, q);
1817 long b11 = SubMod(u1, t1, q);
1818
1819 AA0[j] = b00;
1820 AA1[j] = b10;
1821 AA0[j+1] = b01;
1822 AA1[j+1] = b11;
1823
1824
1825 t = tt;
1826 u = a02;
1827 t1 = tt1;
1828 u1 = a03;
1829 }
1830
1831
1832 AA0[j] = AddMod(u, t, q);
1833 AA1[j] = SubMod(u, t, q);
1834 AA0[j + 1] = AddMod(u1, t1, q);
1835 AA1[j + 1] = SubMod(u1, t1, q);
1836 }
1837 #else
1838 for (j = 0; j < m_half; j += 2) {
1839 const long a00 = AA0[j];
1840 const long a01 = AA0[j+1];
1841 const long a10 = AA1[j];
1842 const long a11 = AA1[j+1];
1843
1844 const long w0 = wtab[j];
1845 const long w1 = wtab[j+1];
1846 const mulmod_precon_t wqi0 = wqinvtab[j];
1847 const mulmod_precon_t wqi1 = wqinvtab[j+1];
1848
1849 const long tt = MulModPrecon(a10, w0, q, wqi0);
1850 const long uu = a00;
1851 const long b00 = AddMod(uu, tt, q);
1852 const long b10 = SubMod(uu, tt, q);
1853
1854 const long tt1 = MulModPrecon(a11, w1, q, wqi1);
1855 const long uu1 = a01;
1856 const long b01 = AddMod(uu1, tt1, q);
1857 const long b11 = SubMod(uu1, tt1, q);
1858
1859 AA0[j] = b00;
1860 AA0[j+1] = b01;
1861 AA1[j] = b10;
1862 AA1[j+1] = b11;
1863 }
1864 }
1865 #endif
1866 }
1867
1868
1869 // s == k, special case
1870 {
1871 m = 1L << s;
1872 m_half = 1L << (s-1);
1873 m_fourth = 1L << (s-2);
1874
1875 const long* NTL_RESTRICT wtab = tab[s]->wtab_precomp.elts();
1876 const mulmod_precon_t * NTL_RESTRICT wqinvtab = tab[s]->wqinvtab_precomp.elts();
1877
1878 for (i = 0; i < n; i+= m) {
1879
1880 long * NTL_RESTRICT AA0 = &AA[i];
1881 long * NTL_RESTRICT AA1 = &AA[i + m_half];
1882 long * NTL_RESTRICT A0 = &A[i];
1883 long * NTL_RESTRICT A1 = &A[i + m_half];
1884
1885 #ifdef NTL_PIPELINE
1886
1887 // pipelining: seems to be faster
1888
1889 t = AA1[0];
1890 u = AA0[0];
1891 t1 = MulModPrecon(AA1[1], wtab[1], q, wqinvtab[1]);
1892 u1 = AA0[1];
1893
1894 for (j = 0; j < m_half-2; j += 2) {
1895 long a02 = AA0[j+2];
1896 long a03 = AA0[j+3];
1897 long a12 = AA1[j+2];
1898 long a13 = AA1[j+3];
1899 long w2 = wtab[j+2];
1900 long w3 = wtab[j+3];
1901 mulmod_precon_t wqi2 = wqinvtab[j+2];
1902 mulmod_precon_t wqi3 = wqinvtab[j+3];
1903
1904 tt = MulModPrecon(a12, w2, q, wqi2);
1905 long b00 = AddMod(u, t, q);
1906 long b10 = SubMod(u, t, q);
1907
1908 tt1 = MulModPrecon(a13, w3, q, wqi3);
1909 long b01 = AddMod(u1, t1, q);
1910 long b11 = SubMod(u1, t1, q);
1911
1912 A0[j] = b00;
1913 A1[j] = b10;
1914 A0[j+1] = b01;
1915 A1[j+1] = b11;
1916
1917
1918 t = tt;
1919 u = a02;
1920 t1 = tt1;
1921 u1 = a03;
1922 }
1923
1924
1925 A0[j] = AddMod(u, t, q);
1926 A1[j] = SubMod(u, t, q);
1927 A0[j + 1] = AddMod(u1, t1, q);
1928 A1[j + 1] = SubMod(u1, t1, q);
1929 }
1930 #else
1931 for (j = 0; j < m_half; j += 2) {
1932 const long a00 = AA0[j];
1933 const long a01 = AA0[j+1];
1934 const long a10 = AA1[j];
1935 const long a11 = AA1[j+1];
1936
1937 const long w0 = wtab[j];
1938 const long w1 = wtab[j+1];
1939 const mulmod_precon_t wqi0 = wqinvtab[j];
1940 const mulmod_precon_t wqi1 = wqinvtab[j+1];
1941
1942 const long tt = MulModPrecon(a10, w0, q, wqi0);
1943 const long uu = a00;
1944 const long b00 = AddMod(uu, tt, q);
1945 const long b10 = SubMod(uu, tt, q);
1946
1947 const long tt1 = MulModPrecon(a11, w1, q, wqi1);
1948 const long uu1 = a01;
1949 const long b01 = AddMod(uu1, tt1, q);
1950 const long b11 = SubMod(uu1, tt1, q);
1951
1952 A0[j] = b00;
1953 A0[j+1] = b01;
1954 A1[j] = b10;
1955 A1[j+1] = b11;
1956 }
1957 }
1958 #endif
1959 }
1960
1961 }
1962
1963
1964
1965
1966
1967
1968 #else
1969
1970 // FFT with precomputed tables, lazy mul
1971
1972
1973 static
1974 void LazyPrecompFFTMultipliers(long k, long q, mulmod_t qinv, const long *root, const FFTMultipliers& tab)
1058 void LazyPrecompFFTMultipliers(long k, mint_t q, mulmod_t qinv, const mint_t *root, const FFTMultipliers& tab)
19751059 {
19761060 if (k < 1) LogicError("LazyPrecompFFTMultipliers: bad input");
19771061
20101094 long m_half = 1L << (s-1);
20111095 long m_fourth = 1L << (s-2);
20121096
2013 const long *wtab_last = tab[s-1]->wtab_precomp.elts();
1097 const mint_t *wtab_last = tab[s-1]->wtab_precomp.elts();
20141098 const mulmod_precon_t *wqinvtab_last = tab[s-1]->wqinvtab_precomp.elts();
20151099
2016 long *wtab = item->wtab_precomp.elts();
1100 mint_t *wtab = item->wtab_precomp.elts();
20171101 mulmod_precon_t *wqinvtab = item->wqinvtab_precomp.elts();
20181102
20191103 for (long i = 0; i < m_fourth; i++) {
20211105 wqinvtab[i] = wqinvtab_last[i];
20221106 }
20231107
2024 long w = root[s];
1108 mint_t w = root[s];
20251109 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, qinv);
20261110
20271111 // prepare wtab...
20311115 wqinvtab[1] = LazyPrepMulModPrecon(wtab[1], q, qinv);
20321116 }
20331117 else {
2034 // some software pipelining
20351118 long i, j;
20361119
20371120 i = m_half-1; j = m_fourth-1;
20421125 i -= 2; j --;
20431126
20441127 for (; i >= 0; i -= 2, j --) {
2045 long wp2 = wtab[i+2];
2046 long wm1 = wtab[j];
1128 mint_t wp2 = wtab[i+2];
1129 mint_t wm1 = wtab[j];
20471130 wqinvtab[i+2] = LazyPrepMulModPrecon(wp2, q, qinv);
20481131 wtab[i-1] = wm1;
20491132 wqinvtab[i-1] = wqinvtab[j];
20591142 }
20601143
20611144
2062
2063
2064 #ifdef NTL_BRC_TEST
2065 bool BRC_test_flag = false;
2066 #endif
2067
2068
2069 // FFT: lazy, tables
2070
2071 void NTL_FFT_ROUTINE_TAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
2072
2073 // performs a 2^k-point convolution modulo q
2074
2075 {
2076 if (!info.bigtab || k > NTL_FFT_BIGTAB_MAXROOT) {
2077 NTL_FFT_ROUTINE_NOTAB(A, a, k, info, dir);
1145 //===================================================================
1146
1147 // TRUNCATED FFT
1148
1149 // This code is derived from code originally developed
1150 // by David Harvey. I include his original documentation,
1151 // annotated appropriately to highlight differences in
1152 // the implementation (see NOTEs).
1153
1154 /*
1155 The DFT is defined as follows.
1156
1157 Let the input sequence be a_0, ..., a_{N-1}.
1158
1159 Let w = standard primitive N-th root of 1, i.e. w = g^(2^FFT62_MAX_LGN / N),
1160 where g = some fixed element of Z/pZ of order 2^FFT62_MAX_LGN.
1161
1162 Let Z = an element of (Z/pZ)^* (twisting parameter).
1163
1164 Then the output sequence is
1165 b_j = \sum_{0 <= i < N} Z^i a_i w^(ij'), for 0 <= j < N,
1166 where j' is the length-lgN bit-reversal of j.
1167
1168 Some of the FFT routines can operate on truncated sequences of certain
1169 "admissible" sizes. A size parameter n is admissible if 1 <= n <= N, and n is
1170 divisible by a certain power of 2. The precise power depends on the recursive
1171 array decomposition of the FFT. The smallest admissible n' >= n can be
1172 obtained via fft62_next_size().
1173 */
1174
1175 // NOTE: the twisting parameter is not implemented.
1176 // NOTE: the next admissible size function is called FFTRoundUp,
1177 // and is defined in FFT.h.
1178
1179
1180 /*
1181 Truncated FFT interface is as follows:
1182
1183 xn and yn must be admissible sizes for N.
1184
1185 Input in xp[] is a_0, a_1, ..., a_{xn-1}. Assumes a_i = 0 for xn <= i < N.
1186
1187 Output in yp[] is b_0, ..., b_{yn-1}, i.e. only first yn outputs are computed.
1188
1189 Twisting parameter Z is described by z and lgH. If z == 0, then Z = basic
1190 2^lgH-th root of 1, and must have lgH >= lgN + 1. If z != 0, then Z = z
1191 (and lgH is ignored).
1192
1193 The buffers {xp,xn} and {yp,yn} may overlap, but only if xp == yp.
1194
1195 Inputs are in [0, 2p), outputs are in [0, 2p).
1196
1197 threads = number of OpenMP threads to use.
1198 */
1199
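
// A minimal O(N^2) reference sketch (not compiled) of the forward truncated
// interface above, assuming the twisting parameter is ignored (Z = 1, as in
// the NOTE below) and no lazy/redundant representation (everything fully
// reduced mod q).  Only the first yn outputs are produced, and a_i is treated
// as 0 for xn <= i < N.  The name naive_truncated_fft is hypothetical; only
// the ordinary MulMod/AddMod primitives are used.
#if 0
static void naive_truncated_fft(mint_t* b, const mint_t* a, long xn, long yn,
                                long lgN, mint_t w, mint_t q)
// w must be a primitive N-th root of unity mod q, where N = 2^lgN
{
   long N = 1L << lgN;
   for (long j = 0; j < yn; j++) {
      // j' = length-lgN bit reversal of j
      long jrev = 0;
      for (long s = 0; s < lgN; s++)
         if (j & (1L << s)) jrev |= (1L << (lgN-1-s));

      // wj = w^(j'), built by repeated multiplication (reference code only)
      mint_t wj = 1;
      for (long s = 0; s < jrev; s++) wj = MulMod(wj, w, q);

      mint_t sum = 0;
      mint_t wij = 1;                        // w^(i*j'), updated incrementally
      for (long i = 0; i < xn; i++) {        // a_i = 0 assumed for i >= xn
         sum = AddMod(sum, MulMod(a[i], wij, q), q);
         wij = MulMod(wij, wj, q);
      }
      b[j] = sum;                            // b_j = \sum_i a_i w^(i*j')
   }
}
#endif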
1200
1201
1202 /*
1203 Inverse truncated FFT interface is as follows.
1204
1205 xn and yn must be admissible sizes for N, with yn <= xn.
1206
1207 Input in xp[] is b_0, b_1, ..., b_{yn-1}, N*a_{yn}, ..., N*a_{xn-1}.
1208
1209 Assumes a_i = 0 for xn <= i < N.
1210
1211 Output in yp[] is N*a_0, ..., N*a_{yn-1}.
1212
1213 Twisting parameter Z is described by z and lgH. If z == 0, then Z = basic
1214 2^lgH-th root of 1, and must have lgH >= lgN + 1. If z != 0, then Z = z^(-1)
1215 (and lgH is ignored).
1216
1217 The buffers {xp,xn} and {yp,yn} may overlap, but only if xp == yp.
1218
1219 Inputs are in [0, 4p), outputs are in [0, 4p).
1220
1221 threads = number of OpenMP threads to use.
1222
1223 (note: no function actually implements this interface in full generality!
1224 This is because it is tricky (and not that useful) to implement the twisting
1225 parameter when xn != yn.)
1226 */
1227
1228 // NOTE: threads and twisting parameter are not used here.
1229 // NOTE: the code has been re-written and simplified so that
1230 // everything is done in place, so xp == yp.
1231
1232
1233
1234
1235 //===================================================================
1236
1237
1238
1239
1240
1241
1242 // NOTE: these could be inlined, but I found the code generation
1243 // to be extremely sensitive to seemingly trivial changes,
1244 // so it seems safest to use macros instead.
1245 // w and wqinv are read only once.
1246 // q is read several times.
1247 // xx0, xx1 are each read once and written once
1248
1249 #define fwd_butterfly(xx0, xx1, w, q, wqinv) \
1250 do \
1251 { \
1252 umint_t x0_ = xx0; \
1253 umint_t x1_ = xx1; \
1254 umint_t t_ = LazySubMod(x0_, x1_, q); \
1255 xx0 = LazyAddMod2(x0_, x1_, q); \
1256 xx1 = LazyMulModPrecon(t_, w, q, wqinv); \
1257 } \
1258 while (0)
1259
1260 #define fwd_butterfly_neg(xx0, xx1, w, q, wqinv) \
1261 do \
1262 { \
1263 umint_t x0_ = xx0; \
1264 umint_t x1_ = xx1; \
1265 umint_t t_ = LazySubMod(x1_, x0_, q); /* NEG */ \
1266 xx0 = LazyAddMod2(x0_, x1_, q); \
1267 xx1 = LazyMulModPrecon(t_, w, q, wqinv); \
1268 } \
1269 while (0)
1270
1271 #define fwd_butterfly1(xx0, xx1, w, q, wqinv, w1, w1qinv) \
1272 do \
1273 { \
1274 umint_t x0_ = xx0; \
1275 umint_t x1_ = xx1; \
1276 umint_t t_ = LazySubMod(x0_, x1_, q); \
1277 xx0 = LazyAddMod2(x0_, x1_, q); \
1278 xx1 = LazyMulModPrecon(LazyMulModPrecon(t_, w1, q, w1qinv), w, q, wqinv); \
1279 } \
1280 while (0)
1281
1282
1283 #define fwd_butterfly0(xx0, xx1, q) \
1284 do \
1285 { \
1286 umint_t x0_ = xx0; \
1287 umint_t x1_ = xx1; \
1288 xx0 = LazyAddMod2(x0_, x1_, q); \
1289 xx1 = LazySubMod2(x0_, x1_, q); \
1290 } \
1291 while (0)
1292
1293
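// The forward butterfly written out as a hypothetical inline function, purely
// to document the ranges involved; the macro above is what the code actually
// uses, and this sketch mirrors it line for line.
#if 0
static inline void fwd_butterfly_sketch(umint_t& xx0, umint_t& xx1,
                                        mint_t w, mint_t q, mulmod_precon_t wqinv)
{
   umint_t x0 = xx0;                          // in [0, 2*q)
   umint_t x1 = xx1;                          // in [0, 2*q)
   umint_t t  = LazySubMod(x0, x1, q);        // in [0, 4*q)
   xx0 = LazyAddMod2(x0, x1, q);              // back in [0, 2*q)
   xx1 = LazyMulModPrecon(t, w, q, wqinv);    // t*w mod q, lazily, in [0, 2*q)
}
#endif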
1294 #define NTL_NEW_FFT_THRESH (11)
1295
1296 struct new_mod_t {
1297 mint_t q;
1298 const mint_t **wtab;
1299 const mulmod_precon_t **wqinvtab;
1300 };
1301
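// A minimal sketch (not compiled) of how a new_mod_t is populated from the
// per-level tables built by ComputeMultipliers above: wtab[s] points at the
// 2^(s-1) powers of the level-s root.  Here mul_vec, k and q stand for
// whatever the caller has at hand; the real setup appears in new_fft_notab
// further below.
#if 0
   const mint_t *wtab[NTL_FFTMaxRoot+1];
   const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
   for (long s = 1; s <= k; s++) {
      wtab[s] = mul_vec[s].wtab_precomp.elts();
      wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
   }

   new_mod_t mod;
   mod.q = q;
   mod.wtab = &wtab[0];
   mod.wqinvtab = &wqinvtab[0];
#endif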
1302
1303
1304
1305
1306 // requires size divisible by 8
1307 static void
1308 new_fft_layer(umint_t* xp, long blocks, long size,
1309 const mint_t* NTL_RESTRICT wtab,
1310 const mulmod_precon_t* NTL_RESTRICT wqinvtab,
1311 mint_t q)
1312 {
1313 size /= 2;
1314
1315 do
1316 {
1317 umint_t* NTL_RESTRICT xp0 = xp;
1318 umint_t* NTL_RESTRICT xp1 = xp + size;
1319
1320 // first 4 butterflies
1321 fwd_butterfly0(xp0[0+0], xp1[0+0], q);
1322 fwd_butterfly(xp0[0+1], xp1[0+1], wtab[0+1], q, wqinvtab[0+1]);
1323 fwd_butterfly(xp0[0+2], xp1[0+2], wtab[0+2], q, wqinvtab[0+2]);
1324 fwd_butterfly(xp0[0+3], xp1[0+3], wtab[0+3], q, wqinvtab[0+3]);
1325
1326 // 4-way unroll
1327 for (long j = 4; j < size; j += 4) {
1328 fwd_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
1329 fwd_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
1330 fwd_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
1331 fwd_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
1332 }
1333
1334 xp += 2 * size;
1335 }
1336 while (--blocks != 0);
1337 }
1338
1339
1340 static void
1341 new_fft_last_two_layers(umint_t* xp, long blocks,
1342 const mint_t* wtab, const mulmod_precon_t* wqinvtab,
1343 mint_t q)
1344 {
1345 // 4th root of unity
1346 mint_t w = wtab[1];
1347 mulmod_precon_t wqinv = wqinvtab[1];
1348
1349 do
1350 {
1351 umint_t u0 = xp[0];
1352 umint_t u1 = xp[1];
1353 umint_t u2 = xp[2];
1354 umint_t u3 = xp[3];
1355
1356 umint_t v0 = LazyAddMod2(u0, u2, q);
1357 umint_t v2 = LazySubMod2(u0, u2, q);
1358 umint_t v1 = LazyAddMod2(u1, u3, q);
1359 umint_t t = LazySubMod(u1, u3, q);
1360 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1361
1362 xp[0] = LazyAddMod2(v0, v1, q);
1363 xp[1] = LazySubMod2(v0, v1, q);
1364 xp[2] = LazyAddMod2(v2, v3, q);
1365 xp[3] = LazySubMod2(v2, v3, q);
1366
1367 xp += 4;
1368 }
1369 while (--blocks != 0);
1370 }
1371
1372
1373
1374 void new_fft_base(umint_t* xp, long lgN, const new_mod_t& mod)
1375 {
1376 if (lgN == 0) return;
1377
1378 mint_t q = mod.q;
1379
1380 if (lgN == 1)
1381 {
1382 umint_t x0 = xp[0];
1383 umint_t x1 = xp[1];
1384 xp[0] = LazyAddMod2(x0, x1, q);
1385 xp[1] = LazySubMod2(x0, x1, q);
20781386 return;
2079 }
2080
2081 long q = info.q;
2082 const long *root = info.RootTable[dir].elts();
2083 mulmod_t qinv = info.qinv;
2084 const FFTMultipliers& tab = info.bigtab->MulTab[dir];
1387 }
1388
1389 const mint_t** wtab = mod.wtab;
1390 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1391
1392 long N = 1L << lgN;
1393
1394 for (long j = lgN, size = N, blocks = 1;
1395 j > 2; j--, blocks <<= 1, size >>= 1)
1396 new_fft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
1397
1398 new_fft_last_two_layers(xp, N/4, wtab[2], wqinvtab[2], q);
1399 }
1400
1401
1402 // Implements the truncated FFT interface, described above.
1403 // All computations are done in place, and xp should point to
1404 // an array of size N, all of which may be overwritten
1405 // during the computation.
1406 static
1407 void new_fft_short(umint_t* xp, long yn, long xn, long lgN,
1408 const new_mod_t& mod)
1409 {
1410 long N = 1L << lgN;
1411
1412 if (yn == N)
1413 {
1414 if (xn == N && lgN <= NTL_NEW_FFT_THRESH)
1415 {
1416 // no truncation
1417 new_fft_base(xp, lgN, mod);
1418 return;
1419 }
1420 }
1421
1422 // divide-and-conquer algorithm
1423
1424 long half = N >> 1;
1425 mint_t q = mod.q;
1426
1427 if (yn <= half)
1428 {
1429 if (xn <= half)
1430 {
1431 new_fft_short(xp, yn, xn, lgN - 1, mod);
1432 }
1433 else
1434 {
1435 xn -= half;
1436
1437 // (X, Y) -> X + Y
1438 for (long j = 0; j < xn; j++)
1439 xp[j] = LazyAddMod2(xp[j], xp[j + half], q);
1440
1441 new_fft_short(xp, yn, half, lgN - 1, mod);
1442 }
1443 }
1444 else
1445 {
1446 yn -= half;
1447
1448 umint_t* NTL_RESTRICT xp0 = xp;
1449 umint_t* NTL_RESTRICT xp1 = xp + half;
1450 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
1451 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
1452
1453 if (xn <= half)
1454 {
1455 // X -> (X, w*X)
1456 for (long j = 0; j < xn; j++)
1457 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j], q, wqinvtab[j]);
1458
1459 new_fft_short(xp0, half, xn, lgN - 1, mod);
1460 new_fft_short(xp1, yn, xn, lgN - 1, mod);
1461 }
1462 else
1463 {
1464 xn -= half;
1465
1466 // (X, Y) -> (X + Y, w*(X - Y))
1467 // DIRT: assumes xn is a multiple of 4
1468 fwd_butterfly0(xp0[0], xp1[0], q);
1469 fwd_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
1470 fwd_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
1471 fwd_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
1472 for (long j = 4; j < xn; j+=4) {
1473 fwd_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
1474 fwd_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
1475 fwd_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
1476 fwd_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
1477 }
1478
1479 // X -> (X, w*X)
1480 for (long j = xn; j < half; j++)
1481 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j], q, wqinvtab[j]);
1482
1483 new_fft_short(xp0, half, half, lgN - 1, mod);
1484 new_fft_short(xp1, yn, half, lgN - 1, mod);
1485 }
1486 }
1487 }
1488
1489 static
1490 void new_fft_short_notab(umint_t* xp, long yn, long xn, long lgN,
1491 const new_mod_t& mod, mint_t w, mint_t wqinv)
1492 // This version assumes that we only have tables up to level lgN-1,
1493 // and w generates the values at level lgN.
1494 // DIRT: requires xn even
1495 {
1496 long N = 1L << lgN;
1497
1498 // divide-and-conquer algorithm
1499
1500 long half = N >> 1;
1501 mint_t q = mod.q;
1502
1503 if (yn <= half)
1504 {
1505 if (xn <= half)
1506 {
1507 new_fft_short(xp, yn, xn, lgN - 1, mod);
1508 }
1509 else
1510 {
1511 xn -= half;
1512
1513 // (X, Y) -> X + Y
1514 for (long j = 0; j < xn; j++)
1515 xp[j] = LazyAddMod2(xp[j], xp[j + half], q);
1516
1517 new_fft_short(xp, yn, half, lgN - 1, mod);
1518 }
1519 }
1520 else
1521 {
1522 yn -= half;
1523
1524 umint_t* NTL_RESTRICT xp0 = xp;
1525 umint_t* NTL_RESTRICT xp1 = xp + half;
1526 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN-1];
1527 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN-1];
1528
1529 if (xn <= half)
1530 {
1531 // X -> (X, w*X)
1532 for (long j = 0, j_half = 0; j < xn; j+=2, j_half++) {
1533 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j_half], q, wqinvtab[j_half]);
1534 xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(xp0[j+1], w, q, wqinv),
1535 wtab[j_half], q, wqinvtab[j_half]);
1536 }
1537
1538 new_fft_short(xp0, half, xn, lgN - 1, mod);
1539 new_fft_short(xp1, yn, xn, lgN - 1, mod);
1540 }
1541 else
1542 {
1543 xn -= half;
1544
1545 // (X, Y) -> (X + Y, w*(X - Y))
1546 fwd_butterfly0(xp0[0], xp1[0], q);
1547 fwd_butterfly(xp0[1], xp1[1], w, q, wqinv);
1548 long j = 2;
1549 long j_half = 1;
1550 for (; j < xn; j+=2, j_half++) {
1551 fwd_butterfly(xp0[j], xp1[j], wtab[j_half], q, wqinvtab[j_half]);
1552 fwd_butterfly1(xp0[j+1], xp1[j+1], wtab[j_half], q, wqinvtab[j_half], w, wqinv);
1553 }
1554
1555 // X -> (X, w*X)
1556 for (; j < half; j+=2, j_half++) {
1557 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j_half], q, wqinvtab[j_half]);
1558 xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(xp0[j+1], w, q, wqinv),
1559 wtab[j_half], q, wqinvtab[j_half]);
1560 }
1561
1562 new_fft_short(xp0, half, half, lgN - 1, mod);
1563 new_fft_short(xp1, yn, half, lgN - 1, mod);
1564 }
1565 }
1566 }
1567
1568
1569 //=====
1570
1571
1572 // NOTE: these "flipped" routines perform the same
1573 // functions as their normal, "unflipped" counterparts,
1574 // except that they work with inverted roots.
1575 // They also perform no truncation, just to keep things simple.
1576 // All of this is necessary only to implement the UpdateMap
1577 // routines for ZZ_pX and zz_pX.
1578
1579 // requires size divisible by 8
1580 static void
1581 new_fft_layer_flipped(umint_t* xp, long blocks, long size,
1582 const mint_t* wtab,
1583 const mulmod_precon_t* wqinvtab,
1584 mint_t q)
1585 {
1586 size /= 2;
1587
1588 const mint_t* NTL_RESTRICT wtab1 = wtab + size;
1589 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + size;
1590
1591 do
1592 {
1593 umint_t* NTL_RESTRICT xp0 = xp;
1594 umint_t* NTL_RESTRICT xp1 = xp + size;
1595
1596 // first 4 butterflies
1597 fwd_butterfly0(xp0[0+0], xp1[0+0], q);
1598 fwd_butterfly_neg(xp0[0+1], xp1[0+1], wtab1[-(0+1)], q, wqinvtab1[-(0+1)]);
1599 fwd_butterfly_neg(xp0[0+2], xp1[0+2], wtab1[-(0+2)], q, wqinvtab1[-(0+2)]);
1600 fwd_butterfly_neg(xp0[0+3], xp1[0+3], wtab1[-(0+3)], q, wqinvtab1[-(0+3)]);
1601
1602 // 4-way unroll
1603 for (long j = 4; j < size; j += 4) {
1604 fwd_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1605 fwd_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1606 fwd_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1607 fwd_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1608 }
1609
1610 xp += 2 * size;
1611 }
1612 while (--blocks != 0);
1613 }
1614
1615
1616
1617 static void
1618 new_fft_last_two_layers_flipped(umint_t* xp, long blocks,
1619 const mint_t* wtab, const mulmod_precon_t* wqinvtab,
1620 mint_t q)
1621 {
1622 // 4th root of unity
1623 mint_t w = wtab[1];
1624 mulmod_precon_t wqinv = wqinvtab[1];
1625
1626 do
1627 {
1628 umint_t u0 = xp[0];
1629 umint_t u1 = xp[1];
1630 umint_t u2 = xp[2];
1631 umint_t u3 = xp[3];
1632
1633 umint_t v0 = LazyAddMod2(u0, u2, q);
1634 umint_t v2 = LazySubMod2(u0, u2, q);
1635 umint_t v1 = LazyAddMod2(u1, u3, q);
1636 umint_t t = LazySubMod(u3, u1, q); // NEG
1637 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1638
1639 xp[0] = LazyAddMod2(v0, v1, q);
1640 xp[1] = LazySubMod2(v0, v1, q);
1641 xp[2] = LazyAddMod2(v2, v3, q);
1642 xp[3] = LazySubMod2(v2, v3, q);
1643
1644 xp += 4;
1645 }
1646 while (--blocks != 0);
1647 }
1648
1649
1650
1651 void new_fft_base_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
1652 {
1653 if (lgN == 0) return;
1654
1655 mint_t q = mod.q;
1656
1657 if (lgN == 1)
1658 {
1659 umint_t x0 = xp[0];
1660 umint_t x1 = xp[1];
1661 xp[0] = LazyAddMod2(x0, x1, q);
1662 xp[1] = LazySubMod2(x0, x1, q);
1663 return;
1664 }
1665
1666 const mint_t** wtab = mod.wtab;
1667 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1668
1669 long N = 1L << lgN;
1670
1671 for (long j = lgN, size = N, blocks = 1;
1672 j > 2; j--, blocks <<= 1, size >>= 1)
1673 new_fft_layer_flipped(xp, blocks, size, wtab[j], wqinvtab[j], q);
1674
1675 new_fft_last_two_layers_flipped(xp, N/4, wtab[2], wqinvtab[2], q);
1676 }
1677
1678
1679 static
1680 void new_fft_short_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
1681 {
1682 long N = 1L << lgN;
1683
1684 if (lgN <= NTL_NEW_FFT_THRESH)
1685 {
1686 new_fft_base_flipped(xp, lgN, mod);
1687 return;
1688 }
1689
1690 // divide-and-conquer algorithm
1691
1692 long half = N >> 1;
1693 mint_t q = mod.q;
1694
1695 umint_t* NTL_RESTRICT xp0 = xp;
1696 umint_t* NTL_RESTRICT xp1 = xp + half;
1697 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN] + half;
1698 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN] + half;
1699
1700 // (X, Y) -> (X + Y, w*(X - Y))
1701
1702 fwd_butterfly0(xp0[0], xp1[0], q);
1703 fwd_butterfly_neg(xp0[1], xp1[1], wtab[-1], q, wqinvtab[-1]);
1704 fwd_butterfly_neg(xp0[2], xp1[2], wtab[-2], q, wqinvtab[-2]);
1705 fwd_butterfly_neg(xp0[3], xp1[3], wtab[-3], q, wqinvtab[-3]);
1706 for (long j = 4; j < half; j+=4) {
1707 fwd_butterfly_neg(xp0[j+0], xp1[j+0], wtab[-(j+0)], q, wqinvtab[-(j+0)]);
1708 fwd_butterfly_neg(xp0[j+1], xp1[j+1], wtab[-(j+1)], q, wqinvtab[-(j+1)]);
1709 fwd_butterfly_neg(xp0[j+2], xp1[j+2], wtab[-(j+2)], q, wqinvtab[-(j+2)]);
1710 fwd_butterfly_neg(xp0[j+3], xp1[j+3], wtab[-(j+3)], q, wqinvtab[-(j+3)]);
1711 }
1712
1713 new_fft_short_flipped(xp0, lgN - 1, mod);
1714 new_fft_short_flipped(xp1, lgN - 1, mod);
1715 }
1716
1717
1718
1719 // IFFT (inverse truncated FFT)
1720
1721
1722 #define inv_butterfly0(xx0, xx1, q) \
1723 do \
1724 { \
1725 umint_t x0_ = LazyReduce2(xx0, q); \
1726 umint_t x1_ = LazyReduce2(xx1, q); \
1727 xx0 = LazyAddMod(x0_, x1_, q); \
1728 xx1 = LazySubMod(x0_, x1_, q); \
1729 } while (0)
1730
1731
1732 #define inv_butterfly_neg(xx0, xx1, w, q, wqinv) \
1733 do \
1734 { \
1735 umint_t x0_ = LazyReduce2(xx0, q); \
1736 umint_t x1_ = xx1; \
1737 umint_t t_ = LazyMulModPrecon(x1_, w, q, wqinv); \
1738 xx0 = LazySubMod(x0_, t_, q); /* NEG */ \
1739 xx1 = LazyAddMod(x0_, t_, q); /* NEG */ \
1740 } while (0)
1741
1742 #define inv_butterfly(xx0, xx1, w, q, wqinv) \
1743 do \
1744 { \
1745 umint_t x0_ = LazyReduce2(xx0, q); \
1746 umint_t x1_ = xx1; \
1747 umint_t t_ = LazyMulModPrecon(x1_, w, q, wqinv); \
1748 xx0 = LazyAddMod(x0_, t_, q); \
1749 xx1 = LazySubMod(x0_, t_, q); \
1750 } while (0)
1751
1752 #define inv_butterfly1_neg(xx0, xx1, w, q, wqinv, w1, w1qinv) \
1753 do \
1754 { \
1755 umint_t x0_ = LazyReduce2(xx0, q); \
1756 umint_t x1_ = xx1; \
1757 umint_t t_ = LazyMulModPrecon(LazyMulModPrecon(x1_, w1, q, w1qinv), w, q, wqinv); \
1758 xx0 = LazySubMod(x0_, t_, q); /* NEG */ \
1759 xx1 = LazyAddMod(x0_, t_, q); /* NEG */ \
1760 } while (0)
1761
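
// The inverse butterfly written out as a hypothetical inline function, again
// only to document the ranges; note that, matching the inverse interface
// described earlier, values are allowed to grow to [0, 4*q) here.  This
// sketch mirrors the inv_butterfly macro above.
#if 0
static inline void inv_butterfly_sketch(umint_t& xx0, umint_t& xx1,
                                        mint_t w, mint_t q, mulmod_precon_t wqinv)
{
   umint_t x0 = LazyReduce2(xx0, q);               // [0, 4*q) -> [0, 2*q)
   umint_t x1 = xx1;                               // in [0, 4*q)
   umint_t t  = LazyMulModPrecon(x1, w, q, wqinv); // x1*w mod q, lazily, in [0, 2*q)
   xx0 = LazyAddMod(x0, t, q);                     // in [0, 4*q)
   xx1 = LazySubMod(x0, t, q);                     // in [0, 4*q)
}
#endif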
1762
1763 static
1764 void new_ifft_short2(umint_t* yp, long yn, long lgN, const new_mod_t& mod);
1765
1766
1767
1768 // requires size divisible by 8
1769 static void
1770 new_ifft_layer(umint_t* xp, long blocks, long size,
1771 const mint_t* wtab,
1772 const mulmod_precon_t* wqinvtab, mint_t q)
1773 {
1774
1775 size /= 2;
1776 const mint_t* NTL_RESTRICT wtab1 = wtab + size;
1777 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + size;
1778
1779 do
1780 {
1781
1782 umint_t* NTL_RESTRICT xp0 = xp;
1783 umint_t* NTL_RESTRICT xp1 = xp + size;
1784
1785
1786 // first 4 butterflies
1787 inv_butterfly0(xp0[0], xp1[0], q);
1788 inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
1789 inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
1790 inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
1791
1792 // 4-way unroll
1793 for (long j = 4; j < size; j+= 4) {
1794 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1795 inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1796 inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1797 inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1798 }
1799
1800 xp += 2 * size;
1801 }
1802 while (--blocks != 0);
1803 }
1804
1805
1806 static void
1807 new_ifft_first_two_layers(umint_t* xp, long blocks, const mint_t* wtab,
1808 const mulmod_precon_t* wqinvtab, mint_t q)
1809 {
1810 // 4th root of unity
1811 mint_t w = wtab[1];
1812 mulmod_precon_t wqinv = wqinvtab[1];
1813
1814 do
1815 {
1816 umint_t u0 = LazyReduce2(xp[0], q);
1817 umint_t u1 = LazyReduce2(xp[1], q);
1818 umint_t u2 = LazyReduce2(xp[2], q);
1819 umint_t u3 = LazyReduce2(xp[3], q);
1820
1821 umint_t v0 = LazyAddMod2(u0, u1, q);
1822 umint_t v1 = LazySubMod2(u0, u1, q);
1823 umint_t v2 = LazyAddMod2(u2, u3, q);
1824 umint_t t = LazySubMod(u2, u3, q);
1825 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1826
1827 xp[0] = LazyAddMod(v0, v2, q);
1828 xp[2] = LazySubMod(v0, v2, q);
1829 xp[1] = LazySubMod(v1, v3, q); // NEG
1830 xp[3] = LazyAddMod(v1, v3, q); // NEG
1831
1832 xp += 4;
1833 }
1834 while (--blocks != 0);
1835 }
1836
1837
1838
1839 static
1840 void new_ifft_base(umint_t* xp, long lgN, const new_mod_t& mod)
1841 {
1842 if (lgN == 0) return;
1843
1844 mint_t q = mod.q;
1845
1846 if (lgN == 1)
1847 {
1848 umint_t x0 = LazyReduce2(xp[0], q);
1849 umint_t x1 = LazyReduce2(xp[1], q);
1850 xp[0] = LazyAddMod(x0, x1, q);
1851 xp[1] = LazySubMod(x0, x1, q);
1852 return;
1853 }
1854
1855 const mint_t** wtab = mod.wtab;
1856 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1857
1858 long blocks = 1L << (lgN - 2);
1859 new_ifft_first_two_layers(xp, blocks, wtab[2], wqinvtab[2], q);
1860 blocks >>= 1;
1861
1862 long size = 8;
1863 for (long j = 3; j <= lgN; j++, blocks >>= 1, size <<= 1)
1864 new_ifft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
1865 }
1866
1867
1868 static
1869 void new_ifft_short1(umint_t* xp, long yn, long lgN, const new_mod_t& mod)
1870
1871 // Implements truncated inverse FFT interface, but with xn==yn.
1872 // All computations are done in place.
1873
1874 {
1875 long N = 1L << lgN;
1876
1877 if (yn == N && lgN <= NTL_NEW_FFT_THRESH)
1878 {
1879 // no truncation
1880 new_ifft_base(xp, lgN, mod);
1881 return;
1882 }
1883
1884 // divide-and-conquer algorithm
1885
1886 long half = N >> 1;
1887 mint_t q = mod.q;
1888
1889 if (yn <= half)
1890 {
1891 // X -> 2X
1892 for (long j = 0; j < yn; j++)
1893 xp[j] = LazyDoubleMod4(xp[j], q);
1894
1895 new_ifft_short1(xp, yn, lgN - 1, mod);
1896 }
1897 else
1898 {
1899 umint_t* NTL_RESTRICT xp0 = xp;
1900 umint_t* NTL_RESTRICT xp1 = xp + half;
1901 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
1902 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
1903
1904 new_ifft_short1(xp0, half, lgN - 1, mod);
1905
1906 yn -= half;
1907
1908 // X -> (2X, w*X)
1909 for (long j = yn; j < half; j++)
1910 {
1911 umint_t x0 = xp0[j];
1912 xp0[j] = LazyDoubleMod4(x0, q);
1913 xp1[j] = LazyMulModPrecon(x0, wtab[j], q, wqinvtab[j]);
1914 }
1915
1916 new_ifft_short2(xp1, yn, lgN - 1, mod);
1917
1918 // (X, Y) -> (X + Y/w, X - Y/w)
1919 {
1920 const mint_t* NTL_RESTRICT wtab1 = wtab + half;
1921 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + half;
1922
1923 // DIRT: assumes yn is a multiple of 4
1924 inv_butterfly0(xp0[0], xp1[0], q);
1925 inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
1926 inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
1927 inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
1928 for (long j = 4; j < yn; j+=4) {
1929 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1930 inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1931 inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1932 inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1933 }
1934 }
1935 }
1936 }
1937
1938
1939 static
1940 void new_ifft_short1_notab(umint_t* xp, long yn, long lgN, const new_mod_t& mod,
1941 mint_t w, mulmod_precon_t wqinv,
1942 mint_t iw, mulmod_precon_t iwqinv)
1943 // This version assumes that we only have tables up to level lgN-1,
1944 // and w generates the values at level lgN.
1945 // DIRT: requires yn even
1946 {
1947 long N = 1L << lgN;
1948
1949 // divide-and-conquer algorithm
1950
1951 long half = N >> 1;
1952 mint_t q = mod.q;
1953
1954 if (yn <= half)
1955 {
1956 // X -> 2X
1957 for (long j = 0; j < yn; j++)
1958 xp[j] = LazyDoubleMod4(xp[j], q);
1959
1960 new_ifft_short1(xp, yn, lgN - 1, mod);
1961 }
1962 else
1963 {
1964 umint_t* NTL_RESTRICT xp0 = xp;
1965 umint_t* NTL_RESTRICT xp1 = xp + half;
1966 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN-1];
1967 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN-1];
1968
1969 new_ifft_short1(xp0, half, lgN - 1, mod);
1970
1971 yn -= half;
1972
1973 // X -> (2X, w*X)
1974 for (long j = yn, j_half = yn/2; j < half; j+=2, j_half++) {
1975 {
1976 umint_t x0 = xp0[j+0];
1977 xp0[j+0] = LazyDoubleMod4(x0, q);
1978 xp1[j+0] = LazyMulModPrecon(x0, wtab[j_half], q, wqinvtab[j_half]);
1979 }
1980 {
1981 umint_t x0 = xp0[j+1];
1982 xp0[j+1] = LazyDoubleMod4(x0, q);
1983 xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(x0, w, q, wqinv),
1984 wtab[j_half], q, wqinvtab[j_half]);
1985 }
1986 }
1987
1988 new_ifft_short2(xp1, yn, lgN - 1, mod);
1989
1990 // (X, Y) -> (X + Y/w, X - Y/w)
1991 {
1992 const mint_t* NTL_RESTRICT wtab1 = wtab + half/2;
1993 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + half/2;
1994
1995 inv_butterfly0(xp0[0], xp1[0], q);
1996 inv_butterfly(xp0[1], xp1[1], iw, q, iwqinv);
1997 for (long j = 2, j_half = 1; j < yn; j+=2, j_half++) {
1998 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-j_half], q, wqinvtab1[-j_half]);
1999 inv_butterfly1_neg(xp0[j+1], xp1[j+1], wtab1[-j_half], q, wqinvtab1[-j_half], iw, iwqinv);
2000 }
2001 }
2002 }
2003 }
2004
2005
2006
2007 //=========
2008
2009
2010 // requires size divisible by 8
2011 static void
2012 new_ifft_layer_flipped(umint_t* xp, long blocks, long size,
2013 const mint_t* NTL_RESTRICT wtab,
2014 const mulmod_precon_t* NTL_RESTRICT wqinvtab, mint_t q)
2015 {
2016
2017 size /= 2;
2018
2019 do
2020 {
2021
2022 umint_t* NTL_RESTRICT xp0 = xp;
2023 umint_t* NTL_RESTRICT xp1 = xp + size;
2024
2025
2026 // first 4 butterflies
2027 inv_butterfly0(xp0[0], xp1[0], q);
2028 inv_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
2029 inv_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
2030 inv_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
2031
2032 // 4-way unroll
2033 for (long j = 4; j < size; j+= 4) {
2034 inv_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
2035 inv_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
2036 inv_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
2037 inv_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
2038 }
2039
2040 xp += 2 * size;
2041 }
2042 while (--blocks != 0);
2043 }
2044
2045
2046 static void
2047 new_ifft_first_two_layers_flipped(umint_t* xp, long blocks, const mint_t* wtab,
2048 const mulmod_precon_t* wqinvtab, mint_t q)
2049 {
2050 // 4th root of unity
2051 mint_t w = wtab[1];
2052 mulmod_precon_t wqinv = wqinvtab[1];
2053
2054 do
2055 {
2056 umint_t u0 = LazyReduce2(xp[0], q);
2057 umint_t u1 = LazyReduce2(xp[1], q);
2058 umint_t u2 = LazyReduce2(xp[2], q);
2059 umint_t u3 = LazyReduce2(xp[3], q);
2060
2061 umint_t v0 = LazyAddMod2(u0, u1, q);
2062 umint_t v1 = LazySubMod2(u0, u1, q);
2063 umint_t v2 = LazyAddMod2(u2, u3, q);
2064 umint_t t = LazySubMod(u2, u3, q);
2065 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
2066
2067 xp[0] = LazyAddMod(v0, v2, q);
2068 xp[2] = LazySubMod(v0, v2, q);
2069 xp[1] = LazyAddMod(v1, v3, q);
2070 xp[3] = LazySubMod(v1, v3, q);
2071
2072 xp += 4;
2073 }
2074 while (--blocks != 0);
2075 }
2076
2077
2078
2079 static
2080 void new_ifft_base_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
2081 {
2082 if (lgN == 0) return;
2083
2084 mint_t q = mod.q;
2085
2086 if (lgN == 1)
2087 {
2088 umint_t x0 = LazyReduce2(xp[0], q);
2089 umint_t x1 = LazyReduce2(xp[1], q);
2090 xp[0] = LazyAddMod(x0, x1, q);
2091 xp[1] = LazySubMod(x0, x1, q);
2092 return;
2093 }
2094
2095 const mint_t** wtab = mod.wtab;
2096 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
2097
2098 long blocks = 1L << (lgN - 2);
2099 new_ifft_first_two_layers_flipped(xp, blocks, wtab[2], wqinvtab[2], q);
2100 blocks >>= 1;
2101
2102 long size = 8;
2103 for (long j = 3; j <= lgN; j++, blocks >>= 1, size <<= 1)
2104 new_ifft_layer_flipped(xp, blocks, size, wtab[j], wqinvtab[j], q);
2105 }
2106
2107
2108 static
2109 void new_ifft_short1_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
2110 {
2111 long N = 1L << lgN;
2112
2113 if (lgN <= NTL_NEW_FFT_THRESH)
2114 {
2115 new_ifft_base_flipped(xp, lgN, mod);
2116 return;
2117 }
2118
2119 // divide-and-conquer algorithm
2120
2121 long half = N >> 1;
2122 mint_t q = mod.q;
2123
2124 umint_t* NTL_RESTRICT xp0 = xp;
2125 umint_t* NTL_RESTRICT xp1 = xp + half;
2126 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
2127 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
2128
2129 new_ifft_short1_flipped(xp0, lgN - 1, mod);
2130 new_ifft_short1_flipped(xp1, lgN - 1, mod);
2131
2132 // (X, Y) -> (X + Y*w, X - Y*w)
2133
2134 inv_butterfly0(xp0[0], xp1[0], q);
2135 inv_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
2136 inv_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
2137 inv_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
2138 for (long j = 4; j < half; j+=4) {
2139 inv_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
2140 inv_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
2141 inv_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
2142 inv_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
2143 }
2144 }
2145
2146 //=========
2147
2148
2149
2150 static
2151 void new_ifft_short2(umint_t* xp, long yn, long lgN, const new_mod_t& mod)
2152
2153 // Implements truncated inverse FFT interface, but with xn==N.
2154 // All computations are done in place.
2155
2156 {
2157 long N = 1L << lgN;
2158
2159 if (yn == N && lgN <= NTL_NEW_FFT_THRESH)
2160 {
2161 // no truncation
2162 new_ifft_base(xp, lgN, mod);
2163 return;
2164 }
2165
2166 // divide-and-conquer algorithm
2167
2168 long half = N >> 1;
2169 mint_t q = mod.q;
2170
2171 if (yn <= half)
2172 {
2173 // X -> 2X
2174 for (long j = 0; j < yn; j++)
2175 xp[j] = LazyDoubleMod4(xp[j], q);
2176 // (X, Y) -> X + Y
2177 for (long j = yn; j < half; j++)
2178 xp[j] = LazyAddMod4(xp[j], xp[j + half], q);
2179
2180 new_ifft_short2(xp, yn, lgN - 1, mod);
2181
2182 // (X, Y) -> X - Y
2183 for (long j = 0; j < yn; j++)
2184 xp[j] = LazySubMod4(xp[j], xp[j + half], q);
2185 }
2186 else
2187 {
2188 umint_t* NTL_RESTRICT xp0 = xp;
2189 umint_t* NTL_RESTRICT xp1 = xp + half;
2190 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
2191 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
2192
2193 new_ifft_short1(xp0, half, lgN - 1, mod);
2194
2195 yn -= half;
2196
2197
2198 // (X, Y) -> (2X - Y, w*(X - Y))
2199 for (long j = yn; j < half; j++)
2200 {
2201 umint_t x0 = xp0[j];
2202 umint_t x1 = xp1[j];
2203 umint_t u = LazySubMod4(x0, x1, q);
2204 xp0[j] = LazyAddMod4(x0, u, q);
2205 xp1[j] = LazyMulModPrecon(u, wtab[j], q, wqinvtab[j]);
2206 }
2207
2208 new_ifft_short2(xp1, yn, lgN - 1, mod);
2209
2210 // (X, Y) -> (X + Y/w, X - Y/w)
2211 {
2212 const mint_t* NTL_RESTRICT wtab1 = wtab + half;
2213 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + half;
2214
2215 // DIRT: assumes yn is a multiple of 4
2216 inv_butterfly0(xp0[0], xp1[0], q);
2217 inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
2218 inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
2219 inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
2220 for (long j = 4; j < yn; j+=4) {
2221 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
2222 inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
2223 inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
2224 inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
2225 }
2226 }
2227 }
2228 }
2229
2230
2231 //=============================================
2232
2233 // HIGH LEVEL ROUTINES
2234
2235 //=========== FFT without tables ===========
2236
2237
2238 NTL_TLS_GLOBAL_DECL(Vec<umint_t>, AA_store)
2239
2240 NTL_TLS_GLOBAL_DECL(Vec<FFTVectorPair>, mul_vec)
2241
2242 void new_fft_notab(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2243 long yn, long xn)
2244
2245 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2246 // xn and yn are as described above in the truncated FFT interface.
2247 // Both A and a should point to arrays of size 2^k,
2248 // and should either be the same or not overlap at all.
2249 // This version does not use precomputed tables.
2250
2251 {
2252 mint_t q = info.q;
20852253
20862254 if (k <= 1) {
20872255 if (k == 0) {
20892257 return;
20902258 }
20912259 if (k == 1) {
2092 long a0 = AddMod(a[0], a[1], q);
2093 long a1 = SubMod(a[0], a[1], q);
2094 A[0] = a0;
2095 A[1] = a1;
2260 mint_t A0 = AddMod(a[0], a[1], q);
2261 mint_t A1 = SubMod(a[0], a[1], q);
2262 A[0] = A0;
2263 A[1] = A1;
20962264 return;
20972265 }
20982266 }
20992267
21002268 // assume k > 1
2269 const mint_t *root = info.RootTable[0].elts();
2270 mulmod_t qinv = info.qinv;
2271
2272 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2273 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2274
2275 long n = 1L << k;
2276
2277 const mint_t *wtab[NTL_FFTMaxRoot+1];
2278 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2279
2280 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2281 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2282
2283 new_mod_t mod;
2284 mod.q = q;
2285 mod.wtab = &wtab[0];
2286 mod.wqinvtab = &wqinvtab[0];
2287
2288 mint_t w = info.RootTable[0][k];
2289 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2290
2291 #ifdef NTL_FFT_USEBUF
2292 NTL_TLS_GLOBAL_ACCESS(AA_store);
2293 AA_store.SetLength(1L << k);
2294 umint_t *AA = AA_store.elts();
2295
2296 for (long i = 0; i < xn; i++) AA[i] = a[i];
2297
2298 new_fft_short_notab(AA, yn, xn, k, mod, w, wqinv);
2299
2300 for (long i = 0; i < yn; i++) {
2301 A[i] = LazyReduce1(AA[i], q);
2302 }
2303 #else
2304 umint_t *AA = (umint_t *) A;
2305 if (a != A) for (long i = 0; i < xn; i++) AA[i] = a[i];
2306
2307 new_fft_short_notab(AA, yn, xn, k, mod, w, wqinv);
2308
2309 for (long i = 0; i < yn; i++) {
2310 AA[i] = LazyReduce1(AA[i], q);
2311 }
2312 #endif
2313 }
2314
2315
2316 void new_fft_flipped_notab(mint_t* A, const mint_t* a, long k,
2317 const FFTPrimeInfo& info)
2318
2319 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2320 // Both A and a should point to arrays of size 2^k,
2321 // and should either be the same or not overlap at all.
2322 // This version is "flipped" -- it uses inverted roots,
2323 // multiplies by 2^{-k}, and performs no truncations.
2324 // This version does not use precomputed tables.
2325
2326 {
2327 mint_t q = info.q;
2328
2329 if (k <= 1) {
2330 if (k == 0) {
2331 A[0] = a[0];
2332 return;
2333 }
2334 if (k == 1) {
2335 mint_t two_inv = info.TwoInvTable[1];
2336 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2337 mint_t A0 = AddMod(a[0], a[1], q);
2338 mint_t A1 = SubMod(a[0], a[1], q);
2339 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2340 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2341 return;
2342 }
2343 }
2344
2345 // assume k > 1
2346 const mint_t *root = info.RootTable[1].elts();
2347 mulmod_t qinv = info.qinv;
2348
2349 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2350 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2351
2352 long n = 1L << k;
2353
2354 const mint_t *wtab[NTL_FFTMaxRoot+1];
2355 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2356
2357 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2358 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2359
2360 new_mod_t mod;
2361 mod.q = q;
2362 mod.wtab = &wtab[0];
2363 mod.wqinvtab = &wqinvtab[0];
2364
2365 mint_t w = info.RootTable[1][k];
2366 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2367
2368 mint_t two_inv = info.TwoInvTable[k];
2369 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2370
2371 #ifdef NTL_FFT_USEBUF
2372 NTL_TLS_GLOBAL_ACCESS(AA_store);
2373 AA_store.SetLength(1L << k);
2374 umint_t *AA = AA_store.elts();
2375
2376 for (long i = 0; i < n; i++) AA[i] = a[i];
2377
2378 new_fft_short_notab(AA, n, n, k, mod, w, wqinv);
2379
2380 for (long i = 0; i < n; i++) {
2381 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2382 A[i] = LazyReduce1(tmp, q);
2383 }
2384 #else
2385 umint_t *AA = (umint_t *) A;
2386 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2387
2388 new_fft_short_notab(AA, n, n, k, mod, w, wqinv);
2389
2390 for (long i = 0; i < n; i++) {
2391 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2392 AA[i] = LazyReduce1(tmp, q);
2393 }
2394
2395 #endif
2396 }
2397
2398
2399 //=========== Inverse FFT without tables ===========
2400
2401 void new_ifft_notab(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2402 long yn)
2403
2404 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2405 // yn==xn are as described above in the truncated FFT interface.
2406 // Both A and a should point to arrays of size 2^k,
2407 // and should either be the same or not overlap at all.
2408 // Multiplies by 2^{-k}.
2409 // This version does not use precomputed tables.
2410
2411 {
2412 mint_t q = info.q;
2413
2414 if (k <= 1) {
2415 if (k == 0) {
2416 A[0] = a[0];
2417 return;
2418 }
2419 if (k == 1) {
2420 mint_t two_inv = info.TwoInvTable[1];
2421 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2422 mint_t A0 = AddMod(a[0], a[1], q);
2423 mint_t A1 = SubMod(a[0], a[1], q);
2424 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2425 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2426 return;
2427 }
2428 }
2429
2430 // assume k > 1
2431 const mint_t *root = info.RootTable[0].elts();
2432 mulmod_t qinv = info.qinv;
2433
2434 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2435 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2436
2437 long n = 1L << k;
2438
2439 const mint_t *wtab[NTL_FFTMaxRoot+1];
2440 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2441
2442 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2443 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2444
2445 new_mod_t mod;
2446 mod.q = q;
2447 mod.wtab = &wtab[0];
2448 mod.wqinvtab = &wqinvtab[0];
2449
2450
2451 mint_t w = info.RootTable[0][k];
2452 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2453
2454 mint_t iw = info.RootTable[1][k];
2455 mulmod_precon_t iwqinv = LazyPrepMulModPrecon(iw, q, info.qinv);
2456
2457 mint_t two_inv = info.TwoInvTable[k];
2458 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2459
2460 #ifdef NTL_FFT_USEBUF
2461 NTL_TLS_GLOBAL_ACCESS(AA_store);
2462 AA_store.SetLength(1L << k);
2463 umint_t *AA = AA_store.elts();
2464
2465 for (long i = 0; i < yn; i++) AA[i] = a[i];
2466
2467 new_ifft_short1_notab(AA, yn, k, mod, w, wqinv, iw, iwqinv);
2468
2469 for (long i = 0; i < yn; i++) {
2470 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2471 A[i] = LazyReduce1(tmp, q);
2472 }
2473 #else
2474 umint_t *AA = (umint_t *) A;
2475 if (a != A) for (long i = 0; i < yn; i++) AA[i] = a[i];
2476
2477 new_ifft_short1_notab(AA, yn, k, mod, w, wqinv, iw, iwqinv);
2478
2479 for (long i = 0; i < yn; i++) {
2480 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2481 AA[i] = LazyReduce1(tmp, q);
2482 }
2483
2484 #endif
2485 }
2486
2487
2488 void new_ifft_flipped_notab(mint_t* A, const mint_t* a, long k,
2489 const FFTPrimeInfo& info)
2490
2491 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2492 // Flipped means inverse roots are used, no truncation is performed, and
2493 // there is no multiplication by 2^{-k}.
2494 // Both A and a should point to arrays of size 2^k,
2495 // and should either be the same or not overlap at all.
2496 // This version does not use precomputed tables.
2497
2498 {
2499 mint_t q = info.q;
2500
2501 if (k <= 1) {
2502 if (k == 0) {
2503 A[0] = a[0];
2504 return;
2505 }
2506 if (k == 1) {
2507 mint_t A0 = AddMod(a[0], a[1], q);
2508 mint_t A1 = SubMod(a[0], a[1], q);
2509 A[0] = A0;
2510 A[1] = A1;
2511 return;
2512 }
2513 }
2514
2515 // assume k > 1
2516 const mint_t *root = info.RootTable[1].elts();
2517 mulmod_t qinv = info.qinv;
2518
2519 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2520 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2521
2522 long n = 1L << k;
2523
2524 const mint_t *wtab[NTL_FFTMaxRoot+1];
2525 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2526
2527 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2528 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2529
2530 new_mod_t mod;
2531 mod.q = q;
2532 mod.wtab = &wtab[0];
2533 mod.wqinvtab = &wqinvtab[0];
2534
2535 mint_t w = info.RootTable[1][k];
2536 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2537
2538 mint_t iw = info.RootTable[0][k];
2539 mulmod_precon_t iwqinv = LazyPrepMulModPrecon(iw, q, info.qinv);
2540
2541 #ifdef NTL_FFT_USEBUF
2542 NTL_TLS_GLOBAL_ACCESS(AA_store);
2543 AA_store.SetLength(1L << k);
2544 umint_t *AA = AA_store.elts();
2545
2546 for (long i = 0; i < n; i++) AA[i] = a[i];
2547
2548
2549 new_ifft_short1_notab(AA, n, k, mod, w, wqinv, iw, iwqinv);
2550
2551 for (long i = 0; i < n; i++) {
2552 umint_t tmp = LazyReduce2(AA[i], q);
2553 A[i] = LazyReduce1(tmp, q);
2554 }
2555 #else
2556 umint_t *AA = (umint_t *) A;
2557 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2558
2559 new_ifft_short1_notab(AA, n, k, mod, w, wqinv, iw, iwqinv);
2560
2561 for (long i = 0; i < n; i++) {
2562 umint_t tmp = LazyReduce2(AA[i], q);
2563 AA[i] = LazyReduce1(tmp, q);
2564 }
2565 #endif
2566 }
2567
2568
2569 #ifndef NTL_ENABLE_AVX_FFT
2570
2571 //================ FFT with tables ==============
2572
2573
2574 void new_fft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2575 long yn, long xn)
2576
2577 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2578 // xn and yn are as described above in the truncated FFT interface.
2579 // Both A and a should point to arrays of size 2^k,
2580 // and should either be the same or not overlap at all.
2581
2582 {
2583 if (!info.bigtab || k > info.bigtab->bound) {
2584 new_fft_notab(A, a, k, info, yn, xn);
2585 return;
2586 }
2587
2588 mint_t q = info.q;
2589
2590 if (k <= 1) {
2591 if (k == 0) {
2592 A[0] = a[0];
2593 return;
2594 }
2595 if (k == 1) {
2596 mint_t A0 = AddMod(a[0], a[1], q);
2597 mint_t A1 = SubMod(a[0], a[1], q);
2598 A[0] = A0;
2599 A[1] = A1;
2600 return;
2601 }
2602 }
2603
2604 // assume k > 1
2605 const mint_t *root = info.RootTable[0].elts();
2606 mulmod_t qinv = info.qinv;
2607 const FFTMultipliers& tab = info.bigtab->MulTab;
21012608
21022609 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
21032610
2104 NTL_TLS_LOCAL(Vec<unsigned long>, AA_store);
2611
2612 long n = 1L << k;
2613
2614
2615 const mint_t *wtab[NTL_FFTMaxRoot+1];
2616 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2617
2618 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2619 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2620
2621 new_mod_t mod;
2622 mod.q = q;
2623 mod.wtab = &wtab[0];
2624 mod.wqinvtab = &wqinvtab[0];
2625
2626
2627
2628 #ifdef NTL_FFT_USEBUF
2629 NTL_TLS_GLOBAL_ACCESS(AA_store);
21052630 AA_store.SetLength(1L << k);
2106 unsigned long *AA = AA_store.elts();
2631 umint_t *AA = AA_store.elts();
2632
2633 for (long i = 0; i < xn; i++) AA[i] = a[i];
2634
2635 new_fft_short(AA, yn, xn, k, mod);
2636
2637 for (long i = 0; i < yn; i++) {
2638 A[i] = LazyReduce1(AA[i], q);
2639 }
2640 #else
2641 umint_t *AA = (umint_t *) A;
2642 if (a != A) for (long i = 0; i < xn; i++) AA[i] = a[i];
2643
2644 new_fft_short(AA, yn, xn, k, mod);
2645
2646 for (long i = 0; i < yn; i++) {
2647 AA[i] = LazyReduce1(AA[i], q);
2648 }
2649 #endif
2650
2651 }
2652
2653 void new_fft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
2654
2655 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2656 // Both A and a should point to arrays of size 2^k,
2657 // and should either be the same or not overlap at all.
2658 // This version is "flipped" -- it uses inverted roots,
2659 // multiplies by 2^{-k}, and performs no truncations.
2660
2661 {
2662 if (!info.bigtab || k > info.bigtab->bound) {
2663 new_fft_flipped_notab(A, a, k, info);
2664 return;
2665 }
2666
2667 mint_t q = info.q;
2668
2669 if (k <= 1) {
2670 if (k == 0) {
2671 A[0] = a[0];
2672 return;
2673 }
2674 if (k == 1) {
2675 mint_t two_inv = info.TwoInvTable[1];
2676 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2677 mint_t A0 = AddMod(a[0], a[1], q);
2678 mint_t A1 = SubMod(a[0], a[1], q);
2679 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2680 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2681 return;
2682 }
2683 }
2684
2685 // assume k > 1
2686 const mint_t *root = info.RootTable[0].elts();
2687 mulmod_t qinv = info.qinv;
2688 const FFTMultipliers& tab = info.bigtab->MulTab;
2689
2690 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
21072691
21082692
21092693 long n = 1L << k;
21102694
2111 #ifndef NTL_BRC_TEST
2112 BitReverseCopy(AA, a, k);
2695
2696 const mint_t *wtab[NTL_FFTMaxRoot+1];
2697 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2698
2699 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2700 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2701
2702 new_mod_t mod;
2703 mod.q = q;
2704 mod.wtab = &wtab[0];
2705 mod.wqinvtab = &wqinvtab[0];
2706
2707 mint_t two_inv = info.TwoInvTable[k];
2708 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2709
2710
2711 #ifdef NTL_FFT_USEBUF
2712 NTL_TLS_GLOBAL_ACCESS(AA_store);
2713 AA_store.SetLength(1L << k);
2714 umint_t *AA = AA_store.elts();
2715
2716 for (long i = 0; i < n; i++) AA[i] = a[i];
2717
2718 new_fft_short_flipped(AA, k, mod);
2719
2720 for (long i = 0; i < n; i++) {
2721 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2722 A[i] = LazyReduce1(tmp, q);
2723 }
21132724 #else
2114 if (BRC_test_flag)
2115 for (long i = 0; i < n; i++) AA[i] = a[i];
2116 else
2117 BitReverseCopy(AA, a, k);
2725 umint_t *AA = (umint_t *) A;
2726 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2727
2728 new_fft_short_flipped(AA, k, mod);
2729
2730 for (long i = 0; i < n; i++) {
2731 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2732 AA[i] = LazyReduce1(tmp, q);
2733 }
21182734 #endif
2119
2120
2121
2122 /* we work with redundant representations, in the range [0, 4q) */
2123
2124
2125
2126 long s, m, m_half, m_fourth, i, j;
2127 unsigned long t, u, t1, u1;
2128
2129
2130 // s = 1
2131 for (i = 0; i < n; i += 2) {
2132 t = AA[i + 1];
2133 u = AA[i];
2134 AA[i] = u + t;
2135 AA[i+1] = u - t + q;
2136 }
2137
2138
2139 // s = 2
2140 {
2141 const long * NTL_RESTRICT wtab = tab[2]->wtab_precomp.elts();
2142 const mulmod_precon_t * NTL_RESTRICT wqinvtab = tab[2]->wqinvtab_precomp.elts();
2143
2144 const long w1 = wtab[1];
2145 const mulmod_precon_t wqi1 = wqinvtab[1];
2146
2147 for (i = 0; i < n; i += 4) {
2148
2149 unsigned long * NTL_RESTRICT AA0 = &AA[i];
2150 unsigned long * NTL_RESTRICT AA1 = &AA[i + 2];
2151
2152 {
2153 const unsigned long a11 = AA1[0];
2154 const unsigned long a01 = AA0[0];
2155
2156 const unsigned long tt1 = a11;
2157 const unsigned long uu1 = a01;
2158 const unsigned long b01 = uu1 + tt1;
2159 const unsigned long b11 = uu1 - tt1 + 2*q;
2160
2161 AA0[0] = b01;
2162 AA1[0] = b11;
2735 }
2736
2737 //======= Inverse FFT with tables ==============
2738
2739
2740 void new_ifft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2741 long yn)
2742
2743 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2744 // yn==xn are as described above in the truncated FFT interface.
2745 // Both A and a should point to arrays of size 2^k,
2746 // and should either be the same or not overlap at all.
2747 // Multiplies by 2^{-k}.
2748
2749 {
2750 if (!info.bigtab || k > info.bigtab->bound) {
2751 new_ifft_notab(A, a, k, info, yn);
2752 return;
2753 }
2754
2755 mint_t q = info.q;
2756
2757 if (k <= 1) {
2758 if (k == 0) {
2759 A[0] = a[0];
2760 return;
2761 }
2762 if (k == 1) {
2763 mint_t two_inv = info.TwoInvTable[1];
2764 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2765 mint_t A0 = AddMod(a[0], a[1], q);
2766 mint_t A1 = SubMod(a[0], a[1], q);
2767 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2768 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2769 return;
2770 }
2771 }
2772
2773 // assume k > 1
2774 const mint_t *root = info.RootTable[0].elts();
2775 mulmod_t qinv = info.qinv;
2776 const FFTMultipliers& tab = info.bigtab->MulTab;
2777
2778 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2779
2780
2781 long n = 1L << k;
2782
2783
2784 const mint_t *wtab[NTL_FFTMaxRoot+1];
2785 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2786
2787 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2788 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2789
2790 new_mod_t mod;
2791 mod.q = q;
2792 mod.wtab = &wtab[0];
2793 mod.wqinvtab = &wqinvtab[0];
2794
2795 mint_t two_inv = info.TwoInvTable[k];
2796 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2797
2798 #ifdef NTL_FFT_USEBUF
2799 NTL_TLS_GLOBAL_ACCESS(AA_store);
2800 AA_store.SetLength(1L << k);
2801 umint_t *AA = AA_store.elts();
2802
2803 for (long i = 0; i < yn; i++) AA[i] = a[i];
2804
2805 new_ifft_short1(AA, yn, k, mod);
2806
2807 for (long i = 0; i < yn; i++) {
2808 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2809 A[i] = LazyReduce1(tmp, q);
2810 }
2811 #else
2812 umint_t *AA = (umint_t *) A;
2813 if (a != A) for (long i = 0; i < yn; i++) AA[i] = a[i];
2814
2815 new_ifft_short1(AA, yn, k, mod);
2816
2817 for (long i = 0; i < yn; i++) {
2818 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2819 AA[i] = LazyReduce1(tmp, q);
2820 }
2821 #endif
2822 }
2823
2824
2825 void new_ifft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
2826
2827
2828 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2829 // Flipped means inverse roots are used, no truncation is performed, and
2830 // there is no multiplication by 2^{-k}.
2831 // Both A and a should point to arrays of size 2^k,
2832 // and should either be the same or not overlap at all.
2833
2834
2835 {
2836 if (!info.bigtab || k > info.bigtab->bound) {
2837 new_ifft_flipped_notab(A, a, k, info);
2838 return;
2839 }
2840
2841 mint_t q = info.q;
2842
2843 if (k <= 1) {
2844 if (k == 0) {
2845 A[0] = a[0];
2846 return;
2847 }
2848 if (k == 1) {
2849 mint_t A0 = AddMod(a[0], a[1], q);
2850 mint_t A1 = SubMod(a[0], a[1], q);
2851 A[0] = A0;
2852 A[1] = A1;
2853 return;
2854 }
2855 }
2856
2857 // assume k > 1
2858 const mint_t *root = info.RootTable[0].elts();
2859 mulmod_t qinv = info.qinv;
2860 const FFTMultipliers& tab = info.bigtab->MulTab;
2861
2862 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2863
2864
2865 long n = 1L << k;
2866
2867
2868 const mint_t *wtab[NTL_FFTMaxRoot+1];
2869 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2870
2871 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2872 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2873
2874 new_mod_t mod;
2875 mod.q = q;
2876 mod.wtab = &wtab[0];
2877 mod.wqinvtab = &wqinvtab[0];
2878
2879
2880 #ifdef NTL_FFT_USEBUF
2881 NTL_TLS_GLOBAL_ACCESS(AA_store);
2882 AA_store.SetLength(1L << k);
2883 umint_t *AA = AA_store.elts();
2884
2885 for (long i = 0; i < n; i++) AA[i] = a[i];
2886
2887 new_ifft_short1_flipped(AA, k, mod);
2888
2889 for (long i = 0; i < n; i++) {
2890 umint_t tmp = LazyReduce2(AA[i], q);
2891 A[i] = LazyReduce1(tmp, q);
2892 }
2893 #else
2894 umint_t *AA = (umint_t *) A;
2895 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2896
2897 new_ifft_short1_flipped(AA, k, mod);
2898
2899 for (long i = 0; i < n; i++) {
2900 umint_t tmp = LazyReduce2(AA[i], q);
2901 AA[i] = LazyReduce1(tmp, q);
2902 }
2903 #endif
2904 }
2905
2906 #endif
2907
2908 //===============================================
2909
2910 void InitFFTPrimeInfo(FFTPrimeInfo& info, long q, long w, long bigtab_index)
2911 {
2912 mulmod_t qinv = PrepMulMod(q);
2913
2914 long mr = CalcMaxRoot(q);
2915
2916 info.q = q;
2917 info.qinv = qinv;
2918 info.qrecip = 1/double(q);
2919 info.zz_p_context = 0;
2920
2921
2922 info.RootTable[0].SetLength(mr+1);
2923 info.RootTable[1].SetLength(mr+1);
2924 info.TwoInvTable.SetLength(mr+1);
2925 info.TwoInvPreconTable.SetLength(mr+1);
2926
2927 long *rt = &info.RootTable[0][0];
2928 long *rit = &info.RootTable[1][0];
2929 long *tit = &info.TwoInvTable[0];
2930 mulmod_precon_t *tipt = &info.TwoInvPreconTable[0];
2931
2932 long j;
2933 long t;
2934
2935 rt[mr] = w;
2936 for (j = mr-1; j >= 0; j--)
2937 rt[j] = MulMod(rt[j+1], rt[j+1], q);
2938
2939 rit[mr] = InvMod(w, q);
2940 for (j = mr-1; j >= 0; j--)
2941 rit[j] = MulMod(rit[j+1], rit[j+1], q);
2942
2943 t = InvMod(2, q);
2944 tit[0] = 1;
2945 for (j = 1; j <= mr; j++)
2946 tit[j] = MulMod(tit[j-1], t, q);
2947
2948 for (j = 0; j <= mr; j++)
2949 tipt[j] = LazyPrepMulModPrecon(tit[j], q, qinv);
2950
2951 #ifndef NTL_ENABLE_AVX_FFT
2952 if (bigtab_index != -1) {
2953 long bound = NTL_FFT_BIGTAB_MAXROOT-bigtab_index/NTL_FFT_BIGTAB_LIMIT;
2954 if (bound > NTL_FFT_BIGTAB_MINROOT) {
2955 info.bigtab.make();
2956 info.bigtab->bound = bound;
2957 }
2958 }
2959 #else
2960 // with the AVX implementation, we unconditionally use tables
2961 info.bigtab.make();
2962 #endif
2963 }
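// ---------------------------------------------------------------------------
// Editorial note (not part of the upstream sources): assuming w is a primitive
// 2^mr-th root of unity mod q (mr = CalcMaxRoot(q)), the loops above establish
//
//    info.RootTable[0][j] == PowerMod(w, 1L << (mr-j), q)   // primitive 2^j-th root
//    info.RootTable[1][j] == InvMod(info.RootTable[0][j], q)
//    info.TwoInvTable[j]  == InvMod(PowerMod(2, j, q), q)   // 2^{-j} mod q
//
// and TwoInvPreconTable[j] holds the LazyPrepMulModPrecon data for multiplying
// by TwoInvTable[j], which the ifft and *_flipped routines above use to fold
// in the 2^{-k} scaling.
// ---------------------------------------------------------------------------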
2964
2965
2966 //===================================================================
2967
2968 #ifdef NTL_ENABLE_AVX_FFT
2969
2970 static void
2971 pd_LazyPrepMulModPrecon(double *bninv, const double *b, double n, long len)
2972 {
2973 CSRPush push;
2974 pd_LazyPrepMulModPrecon_impl(bninv, b, n, len);
2975 }
2976
2977 static
2978 void LazyPrecompFFTMultipliers(long k, mint_t q, mulmod_t qinv, const mint_t *root, const pd_FFTMultipliers& tab)
2979 {
2980 if (k < 1) LogicError("LazyPrecompFFTMultipliers: bad input");
2981
2982 do { // NOTE: thread safe lazy init
2983 pd_FFTMultipliers::Builder bld(tab, k+1);
2984 long amt = bld.amt();
2985 if (!amt) break;
2986
2987 long first = k+1-amt;
2988 // initialize entries first..k
2989
2990
2991 for (long s = first; s <= k; s++) {
2992 UniquePtr<pd_FFTVectorPair> item;
2993
2994 if (s == 0) {
2995 bld.move(item); // position 0 not used
2996 continue;
21632997 }
2164 {
2165 const unsigned long a11 = AA1[1];
2166 const unsigned long a01 = AA0[1];
2167
2168 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2169 const unsigned long uu1 = a01;
2170 const unsigned long b01 = uu1 + tt1;
2171 const unsigned long b11 = uu1 - tt1 + 2*q;
2172
2173 AA0[1] = b01;
2174 AA1[1] = b11;
2998
2999 long m = 1L << s;
3000 long m_half = 1L << (s-1);
3001
3002 item.make();
3003 item->wtab_precomp.SetLength(m_half);
3004 item->wqinvtab_precomp.SetLength(m_half);
3005
3006 double *wtab = item->wtab_precomp.elts();
3007 double *wqinvtab = item->wqinvtab_precomp.elts();
3008
3009 mint_t w = root[s];
3010 mulmod_precon_t wqinv = PrepMulModPrecon(w, q, qinv);
3011
3012 mint_t wi = 1;
3013 wtab[0] = wi;
3014 for (long i = 1; i < m_half; i++) {
3015 wi = MulModPrecon(wi, w, q, wqinv);
3016 wtab[i] = wi;
21753017 }
2176 }
2177 }
2178
2179
2180 // s = 3..k
2181
2182 for (s = 3; s <= k; s++) {
2183 m = 1L << s;
2184 m_half = 1L << (s-1);
2185 m_fourth = 1L << (s-2);
2186
2187 const long* NTL_RESTRICT wtab = tab[s]->wtab_precomp.elts();
2188 const mulmod_precon_t * NTL_RESTRICT wqinvtab = tab[s]->wqinvtab_precomp.elts();
2189
2190 for (i = 0; i < n; i += m) {
2191
2192 unsigned long * NTL_RESTRICT AA0 = &AA[i];
2193 unsigned long * NTL_RESTRICT AA1 = &AA[i + m_half];
2194
2195 #if 1
2196
2197 // a little loop unrolling: this gives the best code
2198
2199 for (j = 0; j < m_half; j += 4) {
2200 {
2201 const long w1 = wtab[j+0];
2202 const mulmod_precon_t wqi1 = wqinvtab[j+0];
2203 const unsigned long a11 = AA1[j+0];
2204 const unsigned long a01 = AA0[j+0];
2205
2206 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2207 const unsigned long uu1 = LazyReduce2(a01, q);
2208 const unsigned long b01 = uu1 + tt1;
2209 const unsigned long b11 = uu1 - tt1 + 2*q;
2210
2211 AA0[j+0] = b01;
2212 AA1[j+0] = b11;
2213 }
2214 {
2215 const long w1 = wtab[j+1];
2216 const mulmod_precon_t wqi1 = wqinvtab[j+1];
2217 const unsigned long a11 = AA1[j+1];
2218 const unsigned long a01 = AA0[j+1];
2219
2220 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2221 const unsigned long uu1 = LazyReduce2(a01, q);
2222 const unsigned long b01 = uu1 + tt1;
2223 const unsigned long b11 = uu1 - tt1 + 2*q;
2224
2225 AA0[j+1] = b01;
2226 AA1[j+1] = b11;
2227 }
2228 {
2229 const long w1 = wtab[j+2];
2230 const mulmod_precon_t wqi1 = wqinvtab[j+2];
2231 const unsigned long a11 = AA1[j+2];
2232 const unsigned long a01 = AA0[j+2];
2233
2234 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2235 const unsigned long uu1 = LazyReduce2(a01, q);
2236 const unsigned long b01 = uu1 + tt1;
2237 const unsigned long b11 = uu1 - tt1 + 2*q;
2238
2239 AA0[j+2] = b01;
2240 AA1[j+2] = b11;
2241 }
2242 {
2243 const long w1 = wtab[j+3];
2244 const mulmod_precon_t wqi1 = wqinvtab[j+3];
2245 const unsigned long a11 = AA1[j+3];
2246 const unsigned long a01 = AA0[j+3];
2247
2248 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2249 const unsigned long uu1 = LazyReduce2(a01, q);
2250 const unsigned long b01 = uu1 + tt1;
2251 const unsigned long b11 = uu1 - tt1 + 2*q;
2252
2253 AA0[j+3] = b01;
2254 AA1[j+3] = b11;
2255 }
2256 }
2257
2258 #else
2259
2260 // a plain loop: not as good as the unrolled version
2261
2262 for (j = 0; j < m_half; j++) {
2263 const long w1 = wtab[j];
2264 const mulmod_precon_t wqi1 = wqinvtab[j];
2265 const unsigned long a11 = AA1[j];
2266 const unsigned long a01 = AA0[j];
2267
2268 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2269 const unsigned long uu1 = LazyReduce2(a01, q);
2270 const unsigned long b01 = uu1 + tt1;
2271 const unsigned long b11 = uu1 - tt1 + 2*q;
2272
2273 AA0[j] = b01;
2274 AA1[j] = b11;
2275 }
3018 pd_LazyPrepMulModPrecon(wqinvtab, wtab, q, m_half);
3019
3020 bld.move(item);
3021 }
3022 } while (0);
3023 }
3024
3025 NTL_TLS_GLOBAL_DECL(AlignedArray<double>, pd_AA_store)
3026 static NTL_CHEAP_THREAD_LOCAL long pd_AA_store_len = 0;
3027
3028
3029 #define PD_MIN_K (NTL_LG2_PDSZ+3)
3030 // k must be at least PD_MIN_K
3031
3032 void new_fft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
3033 long yn, long xn)
3034 {
3035 if (k < PD_MIN_K) {
3036 new_fft_notab(A, a, k, info, yn, xn);
3037 return;
3038 }
3039
3040 long dir = 0;
3041
3042 mint_t q = info.q;
3043 const mint_t *root = info.RootTable[dir].elts();
3044 mulmod_t qinv = info.qinv;
3045 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[dir];
3046
3047 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3048
3049 const double *wtab[NTL_FFTMaxRoot+1];
3050 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3051
3052 const double *wqinvtab[NTL_FFTMaxRoot+1];
3053 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3054
3055 pd_mod_t mod;
3056 mod.q = q;
3057 mod.wtab = &wtab[0];
3058 mod.wqinvtab = &wqinvtab[0];
3059
3060 long n = 1L << k;
3061
3062 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3063 if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3064 double *AA = pd_AA_store.elts();
3065
3066 CSRPush push;
3067 pd_fft_trunc_impl(A, a, AA, k, mod, yn, xn);
3068 }
3069
3070
3071
3072 void new_fft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
3073 {
3074 if (k < PD_MIN_K) {
3075 new_fft_flipped_notab(A, a, k, info);
3076 return;
3077 }
3078
3079 long dir = 1;
3080
3081 mint_t q = info.q;
3082 const mint_t *root = info.RootTable[dir].elts();
3083 mulmod_t qinv = info.qinv;
3084 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[dir];
3085
3086 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3087
3088 const double *wtab[NTL_FFTMaxRoot+1];
3089 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3090
3091 const double *wqinvtab[NTL_FFTMaxRoot+1];
3092 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3093
3094 pd_mod_t mod;
3095 mod.q = q;
3096 mod.wtab = &wtab[0];
3097 mod.wqinvtab = &wqinvtab[0];
3098
3099 long n = 1L << k;
3100
3101 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3102 if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3103 double *AA = pd_AA_store.elts();
3104
3105 CSRPush push;
3106 pd_fft_trunc_impl(A, a, AA, k, mod, n, n, info.TwoInvTable[k]);
3107 }
3108
3109
3110 void new_ifft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
3111 long yn)
3112 {
3113 if (k < PD_MIN_K) {
3114 new_ifft_notab(A, a, k, info, yn);
3115 return;
3116 }
3117
3118 long dir = 0;
3119
3120 mint_t q = info.q;
3121 const mint_t *root = info.RootTable[1-dir].elts();
3122 const mint_t *root1 = info.RootTable[dir].elts();
3123 mulmod_t qinv = info.qinv;
3124 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[1-dir];
3125 const pd_FFTMultipliers& tab1 = info.bigtab->pd_MulTab[dir];
3126
3127 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3128 if (k >= tab1.length()) LazyPrecompFFTMultipliers(k, q, qinv, root1, tab1);
3129
3130 const double *wtab[NTL_FFTMaxRoot+1];
3131 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3132
3133 const double *wqinvtab[NTL_FFTMaxRoot+1];
3134 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3135
3136 const double *wtab1[NTL_FFTMaxRoot+1];
3137 for (long s = 1; s <= k; s++) wtab1[s] = tab1[s]->wtab_precomp.elts();
3138
3139 const double *wqinvtab1[NTL_FFTMaxRoot+1];
3140 for (long s = 1; s <= k; s++) wqinvtab1[s] = tab1[s]->wqinvtab_precomp.elts();
3141
3142 pd_mod_t mod;
3143 mod.q = q;
3144 mod.wtab = &wtab[0];
3145 mod.wqinvtab = &wqinvtab[0];
3146 mod.wtab1 = &wtab1[0];
3147 mod.wqinvtab1 = &wqinvtab1[0];
3148
3149 long n = 1L << k;
3150
3151 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3152 if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3153 double *AA = pd_AA_store.elts();
3154
3155 CSRPush push;
3156 pd_ifft_trunc_impl(A, a, AA, k, mod, yn, info.TwoInvTable[k]);
3157 }
3158
3159
3160 void new_ifft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
3161 {
3162 if (k < PD_MIN_K) {
3163 new_ifft_flipped_notab(A, a, k, info);
3164 return;
3165 }
3166
3167 long dir = 1;
3168
3169 mint_t q = info.q;
3170 const mint_t *root = info.RootTable[1-dir].elts();
3171 const mint_t *root1 = info.RootTable[dir].elts();
3172 mulmod_t qinv = info.qinv;
3173 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[1-dir];
3174 const pd_FFTMultipliers& tab1 = info.bigtab->pd_MulTab[dir];
3175
3176 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3177 if (k >= tab1.length()) LazyPrecompFFTMultipliers(k, q, qinv, root1, tab1);
3178
3179 const double *wtab[NTL_FFTMaxRoot+1];
3180 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3181
3182 const double *wqinvtab[NTL_FFTMaxRoot+1];
3183 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3184
3185 const double *wtab1[NTL_FFTMaxRoot+1];
3186 for (long s = 1; s <= k; s++) wtab1[s] = tab1[s]->wtab_precomp.elts();
3187
3188 const double *wqinvtab1[NTL_FFTMaxRoot+1];
3189 for (long s = 1; s <= k; s++) wqinvtab1[s] = tab1[s]->wqinvtab_precomp.elts();
3190
3191 pd_mod_t mod;
3192 mod.q = q;
3193 mod.wtab = &wtab[0];
3194 mod.wqinvtab = &wqinvtab[0];
3195 mod.wtab1 = &wtab1[0];
3196 mod.wqinvtab1 = &wqinvtab1[0];
3197
3198 long n = 1L << k;
3199
3200 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3201 if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3202 double *AA = pd_AA_store.elts();
3203
3204 CSRPush push;
3205 pd_ifft_trunc_impl(A, a, AA, k, mod, n);
3206 }
22763207
22773208 #endif
22783209
2279 }
2280 }
2281
2282 /* need to reduce redundant representations */
2283
2284 for (i = 0; i < n; i++) {
2285 unsigned long tmp = LazyReduce2(AA[i], q);
2286 A[i] = LazyReduce1(tmp, q);
2287 }
2288 }
2289
2290
2291
2292
2293
2294 #endif
2295
2296
2297
2298
22993210
23003211 NTL_END_IMPL
1313 GF2EInfoT::GF2EInfoT(const GF2X& NewP)
1414 {
1515 build(p, NewP);
16
17 if (p.size == 1) {
18 if (deg(p) <= NTL_BITS_PER_LONG/2)
16 _card_exp = p.n;
17
18 long sz = p.size;
19
20 // The following crossovers were set using the programs
21 // GF2EXKarCross.cpp, GF2EXModCross.cpp, GF2EXModCross.cpp,
22 // and GF2EXGCDCross.cpp.
23 // To use these programs, one has to remove the #if 0 guards
24 // in GF2EX.cpp on mul_disable_plain, BuildPlain, and DivRemPlain.
25
26 // There are three different configurations that are treated separately:
27 // * with gf2x lib and with pclmul instruction available
28 // * without gf2x lib but with pclmul
29 // * without gf2x lib and without pclmul
30 // It is possible that one could be using gf2x lib on a platform without
31 // pclmul, in which case the crossovers used here are not optimal. It is also
32 // possible that one could be using gf2x lib with pclmul, but compile NTL with
33 // NATIVE=off, so that NTL assumes there is no pclmul. Again, this will lead
34 // to crossovers that are not optimal.
35
36 // The crossovers were calculated based on a Skylake Xeon processor:
37 // Intel(R) Xeon(R) Gold 6132 CPU @ 2.60GHz.
38
39
40 #if (defined(NTL_GF2X_LIB) && defined(NTL_HAVE_PCLMUL))
41
42 //========== KarCross ==========
43
44 if (sz <= 1) {
45 if (deg(p) <= NTL_BITS_PER_LONG/2)
46 KarCross = 3;
47 else
1948 KarCross = 4;
49 }
50 else if (sz <= 6) KarCross = 8;
51 else if (sz <= 9) KarCross = 4;
52 else KarCross = 2;
53
54
55
56 //========== ModCross ==========
57
58
59 if (sz <= 1) {
60 if (deg(p) <= NTL_BITS_PER_LONG/2)
61 ModCross = 15;
62 else
63 ModCross = 20;
64 }
65 else if (sz <= 9) ModCross = 60;
66 else if (sz <= 18) ModCross = 25;
67 else ModCross = 15;
68
69
70 //========== DivCross ==========
71
72 if (sz <= 1) {
73 if (deg(p) <= NTL_BITS_PER_LONG/2)
74 DivCross = 50;
75 else
76 DivCross = 75;
77 }
78 else if (sz <= 2) DivCross = 100;
79 else if (sz <= 3) DivCross = 150;
80 else if (sz <= 4) DivCross = 200;
81 else if (sz <= 6) DivCross = 250;
82 else if (sz <= 9) DivCross = 225;
83 else if (sz <= 15) DivCross = 125;
84 else if (sz < 125) DivCross = 100;
85 else DivCross = 75;
86
87 //========== GCDCross ==========
88
89 if (sz <= 1) {
90 if (deg(p) <= NTL_BITS_PER_LONG/2)
91 GCDCross = 225;
92 else
93 GCDCross = 225;
94 }
95 else if (sz <= 2) GCDCross = 450;
96 else if (sz <= 4) GCDCross = 600;
97 else if (sz < 12) GCDCross = 1150;
98 else GCDCross = 600;
99
100
101 #elif (defined(NTL_HAVE_PCLMUL))
102
103 //========== KarCross ==========
104
105 if (sz <= 1) {
106 if (deg(p) <= NTL_BITS_PER_LONG/2)
107 KarCross = 5;
20108 else
21109 KarCross = 8;
22110 }
23 else if (p.size == 2)
24 KarCross = 8;
25 else if (p.size <= 5)
26 KarCross = 4;
27 else if (p.size == 6)
28 KarCross = 3;
29 else
30 KarCross = 2;
31
32
33 if (p.size <= 1) {
34 if (deg(p) <= NTL_BITS_PER_LONG/2)
35 ModCross = 20;
36 else
37 ModCross = 40;
38 }
39 else if (p.size <= 2)
40 ModCross = 75;
41 else if (p.size <= 4)
42 ModCross = 50;
43 else
44 ModCross = 25;
45
46 if (p.size == 1) {
47 if (deg(p) <= NTL_BITS_PER_LONG/2)
48 DivCross = 100;
49 else
50 DivCross = 200;
51 }
52 else if (p.size == 2)
53 DivCross = 400;
54 else if (p.size <= 4)
55 DivCross = 200;
56 else if (p.size == 5)
57 DivCross = 150;
58 else if (p.size <= 13)
59 DivCross = 100;
60 else
61 DivCross = 75;
62
63 _card_exp = p.n;
111 else if (sz <= 5) KarCross = 8;
112 else if (sz <= 9) KarCross = 4;
113 else KarCross = 2;
114
115
116
117 //========== ModCross ==========
118
119
120 if (sz <= 1) {
121 if (deg(p) <= NTL_BITS_PER_LONG/2)
122 ModCross = 30;
123 else
124 ModCross = 45;
125 }
126 else if (sz <= 2) ModCross = 110;
127 else if (sz <= 3) ModCross = 105;
128 else if (sz <= 4) ModCross = 65;
129 else if (sz <= 5) ModCross = 60;
130 else if (sz <= 6) ModCross = 55;
131 else if (sz <= 8) ModCross = 50;
132 else if (sz <= 12) ModCross = 30;
133 else if (sz <= 18) ModCross = 25;
134 else ModCross = 15;
135
136
137
138 //========== DivCross ==========
139
140
141 if (sz <= 1) {
142 if (deg(p) <= NTL_BITS_PER_LONG/2)
143 DivCross = 75;
144 else
145 DivCross = 125;
146 }
147 else if (sz <= 2) DivCross = 450;
148 else if (sz <= 3) DivCross = 425;
149 else if (sz <= 4) DivCross = 375;
150 else if (sz <= 6) DivCross = 250;
151 else if (sz <= 8) DivCross = 225;
152 else if (sz <= 16) DivCross = 125;
153 else if (sz <= 45) DivCross = 100;
154 else DivCross = 75;
155
156
157 //========== GCDCross ==========
158
159 if (sz <= 1) {
160 if (deg(p) <= NTL_BITS_PER_LONG/2)
161 GCDCross = 225;
162 else
163 GCDCross = 225;
164 }
165 else if (sz < 12) GCDCross = 1150;
166 else GCDCross = 850;
167
168 #else
169
170 //========== KarCross ==========
171
172 if (sz <= 1) {
173 if (deg(p) <= NTL_BITS_PER_LONG/2)
174 KarCross = 4;
175 else
176 KarCross = 12;
177 }
178 else if (sz <= 3) KarCross = 4;
179 else KarCross = 2;
180
181
182
183 //========== ModCross ==========
184
185
186 if (sz <= 1) {
187 if (deg(p) <= NTL_BITS_PER_LONG/2)
188 ModCross = 45;
189 else
190 ModCross = 65;
191 }
192 else if (sz <= 2) ModCross = 25;
193 else ModCross = 15;
194
195
196 //========== DivCross ==========
197
198 if (sz <= 1) {
199 if (deg(p) <= NTL_BITS_PER_LONG/2)
200 DivCross = 175;
201 else
202 DivCross = 250;
203 }
204 else if (sz <= 4) DivCross = 100;
205 else DivCross = 75;
206
207 //========== GCDCross ==========
208
209 if (sz <= 1) {
210 if (deg(p) <= NTL_BITS_PER_LONG/2)
211 GCDCross = 225;
212 else
213 GCDCross = 850;
214 }
215 else if (sz < 8) GCDCross = 850;
216 else if (sz < 12) GCDCross = 600;
217 else GCDCross = 450;
218
219
220 #endif
221
64222 }
65223
66224
766766 return;
767767 }
768768
769 if (GF2E::WordLength() <= 1) {
769 bool use_kron_mul = false;
770
771 if (GF2E::WordLength() <= 1) use_kron_mul = true;
772
773 #if (defined(NTL_GF2X_LIB) && defined(NTL_HAVE_PCLMUL))
774 // With gf2x library and pclmul, KronMul is better in a larger range, but
775 // it is very hard to characterize that range. The following is very
776 // conservative.
777
778 if (GF2E::WordLength() <= 4 && sa >= 50 && sb >= 50) use_kron_mul = true;
779 // FIXME: figure out a larger range where KronMul is better
780 // (and don't forget to recompute crossovers in GF2E.cpp).
781 #endif
782
783
784 if (use_kron_mul) {
770785 KronMul(c, a, b);
771786 return;
772787 }
807822 }
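// ---------------------------------------------------------------------------
// Editorial note (not part of the upstream sources): roughly speaking, KronMul
// multiplies via Kronecker substitution -- the GF2E coefficients are packed
// into a single pair of GF2X polynomials with enough zero padding that the
// coefficients of the product can be read back off and reduced modulo
// GF2E::modulus().  It therefore wins exactly when one large GF2X multiply is
// cheap relative to many small GF2E multiplies, e.g. with the gf2x library
// and/or pclmul, which is what the use_kron_mul dispatch above reflects.
// ---------------------------------------------------------------------------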
808823
809824
825
826 #if 0
827 // used only for computing KarCross using GF2EXKarCross.cpp
828 void mul_disable_plain(GF2EX& c, const GF2EX& a, const GF2EX& b)
829 {
830 if (IsZero(a) || IsZero(b)) {
831 clear(c);
832 return;
833 }
834
835 if (&a == &b) {
836 sqr(c, a);
837 return;
838 }
839
840 long sa = a.rep.length();
841 long sb = b.rep.length();
842
843 if (sa == 1) {
844 mul(c, b, a.rep[0]);
845 return;
846 }
847
848 if (sb == 1) {
849 mul(c, a, b.rep[0]);
850 return;
851 }
852
853 if (0) {
854 //if (sa < GF2E::KarCross() || sb < GF2E::KarCross()) {
855 PlainMul(c, a, b);
856 return;
857 }
858
859 if (GF2E::WordLength() <= 1) {
860 KronMul(c, a, b);
861 return;
862 }
863
864
865 /* karatsuba */
866
867 long n, hn, sp;
868
869 n = max(sa, sb);
870 sp = 0;
871 do {
872 hn = (n+1) >> 1;
873 sp += (hn << 2) - 1;
874 n = hn;
875 } while (n > 1);
876
877 GF2XVec stk;
878 stk.SetSize(sp + 2*(sa+sb)-1, 2*GF2E::WordLength());
879
880 long i;
881
882 for (i = 0; i < sa; i++)
883 stk[i+sa+sb-1] = rep(a.rep[i]);
884
885 for (i = 0; i < sb; i++)
886 stk[i+2*sa+sb-1] = rep(b.rep[i]);
887
888 KarMul(&stk[0], &stk[sa+sb-1], sa, &stk[2*sa+sb-1], sb,
889 &stk[2*(sa+sb)-1]);
890
891 c.rep.SetLength(sa+sb-1);
892
893 for (i = 0; i < sa+sb-1; i++)
894 conv(c.rep[i], stk[i]);
895
896 c.normalize();
897 }
898 #endif
899
900
901
902
903
810904 void MulTrunc(GF2EX& x, const GF2EX& a, const GF2EX& b, long n)
811905 {
812906 GF2EX t;
11731267 }
11741268
11751269
1176 void GCD(GF2EX& x, const GF2EX& a, const GF2EX& b)
1270 void PlainGCD(GF2EX& x, const GF2EX& a, const GF2EX& b)
11771271 {
11781272 GF2E t;
11791273
12061300 mul(x, x, t);
12071301 }
12081302
1303 class _NTL_GF2EXMatrix {
1304 private:
1305
1306 _NTL_GF2EXMatrix(const _NTL_GF2EXMatrix&); // disable
1307 GF2EX elts[2][2];
1308
1309 public:
1310
1311 _NTL_GF2EXMatrix() { }
1312 ~_NTL_GF2EXMatrix() { }
1313
1314 void operator=(const _NTL_GF2EXMatrix&);
1315 GF2EX& operator() (long i, long j) { return elts[i][j]; }
1316 const GF2EX& operator() (long i, long j) const { return elts[i][j]; }
1317 };
1318
1319
1320 void _NTL_GF2EXMatrix::operator=(const _NTL_GF2EXMatrix& M)
1321 {
1322 elts[0][0] = M.elts[0][0];
1323 elts[0][1] = M.elts[0][1];
1324 elts[1][0] = M.elts[1][0];
1325 elts[1][1] = M.elts[1][1];
1326 }
1327
1328
1329 static
1330 void mul(GF2EX& U, GF2EX& V, const _NTL_GF2EXMatrix& M)
1331 // (U, V)^T = M*(U, V)^T
1332 {
1333 GF2EX t1, t2, t3;
1334
1335 mul(t1, M(0,0), U);
1336 mul(t2, M(0,1), V);
1337 add(t3, t1, t2);
1338 mul(t1, M(1,0), U);
1339 mul(t2, M(1,1), V);
1340 add(V, t1, t2);
1341 U = t3;
1342 }
1343
1344
1345 static
1346 void mul(_NTL_GF2EXMatrix& A, _NTL_GF2EXMatrix& B, _NTL_GF2EXMatrix& C)
1347 // A = B*C, B and C are destroyed
1348 {
1349 GF2EX t1, t2;
1350
1351 mul(t1, B(0,0), C(0,0));
1352 mul(t2, B(0,1), C(1,0));
1353 add(A(0,0), t1, t2);
1354
1355 mul(t1, B(1,0), C(0,0));
1356 mul(t2, B(1,1), C(1,0));
1357 add(A(1,0), t1, t2);
1358
1359 mul(t1, B(0,0), C(0,1));
1360 mul(t2, B(0,1), C(1,1));
1361 add(A(0,1), t1, t2);
1362
1363 mul(t1, B(1,0), C(0,1));
1364 mul(t2, B(1,1), C(1,1));
1365 add(A(1,1), t1, t2);
1366
1367 long i, j;
1368 for (i = 0; i < 2; i++) {
1369 for (j = 0; j < 2; j++) {
1370 B(i,j).kill();
1371 C(i,j).kill();
1372 }
1373 }
1374 }
1375
1376
1377 void IterHalfGCD(_NTL_GF2EXMatrix& M_out, GF2EX& U, GF2EX& V, long d_red)
1378 {
1379 M_out(0,0).SetMaxLength(d_red);
1380 M_out(0,1).SetMaxLength(d_red);
1381 M_out(1,0).SetMaxLength(d_red);
1382 M_out(1,1).SetMaxLength(d_red);
1383
1384 set(M_out(0,0)); clear(M_out(0,1));
1385 clear(M_out(1,0)); set(M_out(1,1));
1386
1387 long goal = deg(U) - d_red;
1388
1389 if (deg(V) <= goal)
1390 return;
1391
1392 GF2EX Q, t(INIT_SIZE, d_red);
1393
1394 while (deg(V) > goal) {
1395 PlainDivRem(Q, U, U, V);
1396 swap(U, V);
1397
1398 mul(t, Q, M_out(1,0));
1399 sub(t, M_out(0,0), t);
1400 M_out(0,0) = M_out(1,0);
1401 M_out(1,0) = t;
1402
1403 mul(t, Q, M_out(1,1));
1404 sub(t, M_out(0,1), t);
1405 M_out(0,1) = M_out(1,1);
1406 M_out(1,1) = t;
1407 }
1408 }
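// ---------------------------------------------------------------------------
// Editorial note (not part of the upstream sources): IterHalfGCD above and the
// recursive HalfGCD below build a 2x2 matrix M_out out of the quotients of the
// Euclidean remainder sequence of (U, V).  The intended contract -- which the
// iterative version implements directly -- is that applying M_out to the
// original pair (via mul(U, V, M_out)) yields two consecutive remainders with
// deg(V) <= deg(U_original) - d_red.  The recursion below reaches the same
// state with two half-size subproblems plus one division, which is what makes
// the HalfGCD-based GCD and XGCD further down asymptotically fast.
// ---------------------------------------------------------------------------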
1409
1410
1411 #define NTL_GF2EX_HalfGCD_CROSSOVER (40)
1412
1413
1414 void HalfGCD(_NTL_GF2EXMatrix& M_out, const GF2EX& U, const GF2EX& V, long d_red)
1415 {
1416 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
1417 set(M_out(0,0)); clear(M_out(0,1));
1418 clear(M_out(1,0)); set(M_out(1,1));
1419
1420 return;
1421 }
1422
1423
1424 long n = deg(U) - 2*d_red + 2;
1425 if (n < 0) n = 0;
1426
1427 GF2EX U1, V1;
1428
1429 RightShift(U1, U, n);
1430 RightShift(V1, V, n);
1431
1432 if (d_red <= NTL_GF2EX_HalfGCD_CROSSOVER) {
1433 IterHalfGCD(M_out, U1, V1, d_red);
1434 return;
1435 }
1436
1437 long d1 = (d_red + 1)/2;
1438 if (d1 < 1) d1 = 1;
1439 if (d1 >= d_red) d1 = d_red - 1;
1440
1441 _NTL_GF2EXMatrix M1;
1442
1443 HalfGCD(M1, U1, V1, d1);
1444 mul(U1, V1, M1);
1445
1446 long d2 = deg(V1) - deg(U) + n + d_red;
1447
1448 if (IsZero(V1) || d2 <= 0) {
1449 M_out = M1;
1450 return;
1451 }
1452
1453
1454 GF2EX Q;
1455 _NTL_GF2EXMatrix M2;
1456
1457 DivRem(Q, U1, U1, V1);
1458 swap(U1, V1);
1459
1460 HalfGCD(M2, U1, V1, d2);
1461
1462 GF2EX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
1463
1464 mul(t, Q, M1(1,0));
1465 sub(t, M1(0,0), t);
1466 swap(M1(0,0), M1(1,0));
1467 swap(M1(1,0), t);
1468
1469 t.kill();
1470
1471 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
1472
1473 mul(t, Q, M1(1,1));
1474 sub(t, M1(0,1), t);
1475 swap(M1(0,1), M1(1,1));
1476 swap(M1(1,1), t);
1477
1478 t.kill();
1479
1480 mul(M_out, M2, M1);
1481 }
1482
1483 void XHalfGCD(_NTL_GF2EXMatrix& M_out, GF2EX& U, GF2EX& V, long d_red)
1484 {
1485 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
1486 set(M_out(0,0)); clear(M_out(0,1));
1487 clear(M_out(1,0)); set(M_out(1,1));
1488
1489 return;
1490 }
1491
1492 long du = deg(U);
1493
1494 if (d_red <= NTL_GF2EX_HalfGCD_CROSSOVER) {
1495 IterHalfGCD(M_out, U, V, d_red);
1496 return;
1497 }
1498
1499 long d1 = (d_red + 1)/2;
1500 if (d1 < 1) d1 = 1;
1501 if (d1 >= d_red) d1 = d_red - 1;
1502
1503 //ZZ_pXMatrix M1;
1504 _NTL_GF2EXMatrix M1;
1505
1506 HalfGCD(M1, U, V, d1);
1507 mul(U, V, M1);
1508
1509 long d2 = deg(V) - du + d_red;
1510
1511 if (IsZero(V) || d2 <= 0) {
1512 M_out = M1;
1513 return;
1514 }
1515
1516
1517 GF2EX Q;
1518 _NTL_GF2EXMatrix M2;
1519
1520 DivRem(Q, U, U, V);
1521 swap(U, V);
1522
1523 XHalfGCD(M2, U, V, d2);
1524
1525 GF2EX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
1526
1527 mul(t, Q, M1(1,0));
1528 sub(t, M1(0,0), t);
1529 swap(M1(0,0), M1(1,0));
1530 swap(M1(1,0), t);
1531
1532 t.kill();
1533
1534 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
1535
1536 mul(t, Q, M1(1,1));
1537 sub(t, M1(0,1), t);
1538 swap(M1(0,1), M1(1,1));
1539 swap(M1(1,1), t);
1540
1541 t.kill();
1542
1543 mul(M_out, M2, M1);
1544 }
1545
1546 void HalfGCD(GF2EX& U, GF2EX& V)
1547 {
1548 long d_red = (deg(U)+1)/2;
1549
1550 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
1551 return;
1552 }
1553
1554 long du = deg(U);
1555
1556
1557 long d1 = (d_red + 1)/2;
1558 if (d1 < 1) d1 = 1;
1559 if (d1 >= d_red) d1 = d_red - 1;
1560
1561 _NTL_GF2EXMatrix M1;
1562
1563 HalfGCD(M1, U, V, d1);
1564 mul(U, V, M1);
1565
1566 long d2 = deg(V) - du + d_red;
1567
1568 if (IsZero(V) || d2 <= 0) {
1569 return;
1570 }
1571
1572 M1(0,0).kill();
1573 M1(0,1).kill();
1574 M1(1,0).kill();
1575 M1(1,1).kill();
1576
1577
1578 GF2EX Q;
1579
1580 DivRem(Q, U, U, V);
1581 swap(U, V);
1582
1583 HalfGCD(M1, U, V, d2);
1584
1585 mul(U, V, M1);
1586 }
1587
1588
1589 void GCD(GF2EX& d, const GF2EX& u, const GF2EX& v)
1590 {
1591 GF2EX u1, v1;
1592
1593 u1 = u;
1594 v1 = v;
1595
1596 if (deg(u1) == deg(v1)) {
1597 if (IsZero(u1)) {
1598 clear(d);
1599 return;
1600 }
1601
1602 rem(v1, v1, u1);
1603 }
1604 else if (deg(u1) < deg(v1)) {
1605 swap(u1, v1);
1606 }
1607
1608 // deg(u1) > deg(v1)
1609
1610 while (deg(u1) >= GF2E::GCDCross() && !IsZero(v1)) {
1611 HalfGCD(u1, v1);
1612
1613 if (!IsZero(v1)) {
1614 rem(u1, u1, v1);
1615 swap(u1, v1);
1616 }
1617 }
1618
1619 PlainGCD(d, u1, v1);
1620 }
12091621
12101622
12111623
12121624
1625
12131626 void XGCD(GF2EX& d, GF2EX& s, GF2EX& t, const GF2EX& a, const GF2EX& b)
12141627 {
1215 GF2E z;
1216
1217
1218 if (IsZero(b)) {
1628 GF2E w;
1629
1630 if (IsZero(a) && IsZero(b)) {
1631 clear(d);
12191632 set(s);
12201633 clear(t);
1221 d = a;
1222 }
1223 else if (IsZero(a)) {
1224 clear(s);
1225 set(t);
1226 d = b;
1227 }
1228 else {
1229 long e = max(deg(a), deg(b)) + 1;
1230
1231 GF2EX temp(INIT_SIZE, e), u(INIT_SIZE, e), v(INIT_SIZE, e),
1232 u0(INIT_SIZE, e), v0(INIT_SIZE, e),
1233 u1(INIT_SIZE, e), v1(INIT_SIZE, e),
1234 u2(INIT_SIZE, e), v2(INIT_SIZE, e), q(INIT_SIZE, e);
1235
1236
1237 set(u1); clear(v1);
1238 clear(u2); set(v2);
1239 u = a; v = b;
1240
1241 do {
1242 DivRem(q, u, u, v);
1243 swap(u, v);
1244 u0 = u2;
1245 v0 = v2;
1246 mul(temp, q, u2);
1247 add(u2, u1, temp);
1248 mul(temp, q, v2);
1249 add(v2, v1, temp);
1250 u1 = u0;
1251 v1 = v0;
1252 } while (!IsZero(v));
1253
1254 d = u;
1255 s = u1;
1256 t = v1;
1257 }
1258
1259 if (IsZero(d)) return;
1260 if (IsOne(LeadCoeff(d))) return;
1261
1262 /* make gcd monic */
1263
1264 inv(z, LeadCoeff(d));
1265 mul(d, d, z);
1266 mul(s, s, z);
1267 mul(t, t, z);
1634 return;
1635 }
1636
1637 GF2EX U, V, Q;
1638
1639 U = a;
1640 V = b;
1641
1642 long flag = 0;
1643
1644 if (deg(U) == deg(V)) {
1645 DivRem(Q, U, U, V);
1646 swap(U, V);
1647 flag = 1;
1648 }
1649 else if (deg(U) < deg(V)) {
1650 swap(U, V);
1651 flag = 2;
1652 }
1653
1654 _NTL_GF2EXMatrix M;
1655
1656 XHalfGCD(M, U, V, deg(U)+1);
1657
1658 d = U;
1659
1660 if (flag == 0) {
1661 s = M(0,0);
1662 t = M(0,1);
1663 }
1664 else if (flag == 1) {
1665 s = M(0,1);
1666 mul(t, Q, M(0,1));
1667 sub(t, M(0,0), t);
1668 }
1669 else { /* flag == 2 */
1670 s = M(0,1);
1671 t = M(0,0);
1672 }
1673
1674 // normalize
1675
1676 inv(w, LeadCoeff(d));
1677 mul(d, d, w);
1678 mul(s, s, w);
1679 mul(t, t, w);
12681680 }
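// ---------------------------------------------------------------------------
// Editorial note (not part of the upstream sources): on return (and when the
// inputs are not both zero), d is the monic gcd and s, t satisfy the Bezout
// relation d = s*a + t*b.  Under mild conditions on the input degrees, the
// usual bounds deg(s) < deg(b) and deg(t) < deg(a) are expected to hold;
// GF2EXGCDTest.cpp, added later in this changeset, checks the Bezout relation,
// these degree bounds, and agreement between GCD and XGCD.
// ---------------------------------------------------------------------------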
12691681
12701682
15581970 }
15591971 }
15601972
1973
1974 #if 0
1975 // used only for computing ModCross using GF2EXModCross.cpp
1976 void BuildPlain(GF2EXModulus& F, const GF2EX& f, bool plain)
1977 {
1978 long n = deg(f);
1979
1980 if (n <= 0) LogicError("build(GF2EXModulus,GF2EX): deg(f) <= 0");
1981
1982 if (NTL_OVERFLOW(n, GF2E::degree(), 0))
1983 ResourceError("build(GF2EXModulus,GF2EX): overflow");
1984
1985 F.tracevec.make();
1986
1987 F.f = f;
1988 F.n = n;
1989
1990 if (plain) {
1991 F.method = GF2EX_MOD_PLAIN;
1992 }
1993 else {
1994 F.method = GF2EX_MOD_MUL;
1995 GF2EX P1;
1996 GF2EX P2;
1997
1998 CopyReverse(P1, f, n);
1999 InvTrunc(P2, P1, n-1);
2000 CopyReverse(P1, P2, n-2);
2001 trunc(F.h0, P1, n-2);
2002 trunc(F.f0, f, n);
2003 F.hlc = ConstTerm(P2);
2004 }
2005 }
2006 #endif
2007
15612008 GF2EXModulus::GF2EXModulus()
15622009 {
15632010 n = -1;
20562503 DivRem(q, r, a, B);
20572504 }
20582505 }
2506
2507 #if 0
2508 // used only for computing DivCross using GF2EXDivCross.cpp
2509 void DivRemPlain(GF2EX& q, GF2EX& r, const GF2EX& a, const GF2EX& b, bool plain)
2510 {
2511 long sa = a.rep.length();
2512 long sb = b.rep.length();
2513
2514 if (plain)
2515 PlainDivRem(q, r, a, b);
2516 else if (sa < 4*sb)
2517 UseMulDivRem(q, r, a, b);
2518 else {
2519 GF2EXModulus B;
2520 build(B, b);
2521 DivRem(q, r, a, B);
2522 }
2523 }
2524 #endif
20592525
20602526 void div(GF2EX& q, const GF2EX& a, const GF2EX& b)
20612527 {
0 #include <NTL/GF2EX.h>
1 #include <NTL/GF2XFactoring.h>
2
3 namespace NTL {
4
5 void DivRemPlain(GF2EX& q, GF2EX& r, const GF2EX& a, const GF2EX& b, bool plain);
6
7 }
8
9 NTL_CLIENT
10
11
12 #define TIME_IT(t, action) \
13 do { \
14 double _t0, _t1; \
15 long _iter = 1; \
16 long _cnt = 0; \
17 do { \
18 _t0 = GetTime(); \
19 for (long _i = 0; _i < _iter; _i++) { action; _cnt++; } \
20 _t1 = GetTime(); \
21 } while ( _t1 - _t0 < 2 && (_iter *= 2)); \
22 t = (_t1 - _t0)/_iter; \
23 } while(0)
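// Editorial note (not part of the upstream sources): TIME_IT(t, action) runs
// `action` in a loop, doubling the iteration count until one pass takes at
// least 2 seconds as measured by GetTime(), then stores the resulting
// per-iteration time in t.  The same macro is repeated in the other
// crossover-tuning programs below.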
24
25
26 long test(long k)
27 {
28 GF2X P;
29
30 BuildIrred(P, k);
31 GF2EPush push(P);
32
33 for (long n = 25; ; n+=25) {
34 cerr << ",";
35 GF2EX a, b, q, r;
36 random(a, 2*n);
37 random(b, n);
38 double t1, t2;
39 TIME_IT(t1, DivRemPlain(q, r, a, b, false));
40 TIME_IT(t2, DivRemPlain(q, r, a, b, true));
41 double t = t1/t2;
42 if (t <= 0.95) return n;
43 }
44 }
45
46 int main()
47 {
48 cerr << "0.5 " << test(32) << "\n";
49 for (long i = 1; i <= 50; i++) {
50 cerr << i << " " << test(64*i) << "\n";
51 }
52
53 for (long i = 75; i <= 200 ; i+=25) {
54 cerr << i << " " << test(64*i) << "\n";
55 }
56 }
57
58
0 #include <NTL/GF2EX.h>
1 #include <NTL/GF2XFactoring.h>
2
3 namespace NTL {
4
5 void HalfGCD(GF2EX&,GF2EX&);
6 void PlainRem(GF2EX& r, const GF2EX& a, const GF2EX& b, GF2XVec& x);
7
8 }
9
10 NTL_CLIENT
11
12
13 #define TIME_IT(t, action) \
14 do { \
15 double _t0, _t1; \
16 long _iter = 1; \
17 long _cnt = 0; \
18 do { \
19 _t0 = GetTime(); \
20 for (long _i = 0; _i < _iter; _i++) { action; _cnt++; } \
21 _t1 = GetTime(); \
22 } while ( _t1 - _t0 < 2 && (_iter *= 2)); \
23 t = (_t1 - _t0)/_iter; \
24 } while(0)
25
26
27
28 void TestGCD1(long bnd, const GF2EX& a, const GF2EX& b)
29 {
30
31 long n = deg(a) + 1;
32 GF2EX u(INIT_SIZE, n), v(INIT_SIZE, n);
33 GF2XVec tmp(n, 2*GF2E::WordLength());
34
35 u = a;
36 v = b;
37 while (deg(v) > bnd) {
38 PlainRem(u, u, v, tmp);
39 swap(u, v);
40 }
41
42 }
43
44
45 long test(long k)
46 {
47 GF2X P;
48
49 BuildIrred(P, k);
50 GF2EPush push(P);
51
52 for (long n = 42; ; n = long(n*1.4)) {
53 cerr << ",";
54 GF2EX d, a, b, u, v;
55 random(a, n);
56 SetCoeff(a, n);
57 random(b, n);
58 double t1, t2;
59 TIME_IT(t1, u=a; v=b; HalfGCD(u, v));
60 TIME_IT(t2, TestGCD1(deg(v), a, b));
61 double t = t1/t2;
62 if (t <= 1) return n;
63 }
64 }
65
66 int main()
67 {
68 #if 1
69 cerr << "0.5 " << test(32) << "\n";
70 for (long i = 1; i <= 4; i+=1) {
71 cerr << i << " " << test(64*i) << "\n";
72 }
73 for (long i = 8; i <= 16; i+=4) {
74 cerr << i << " " << test(64*i) << "\n";
75 }
76 #endif
77 for (long i = 24; i <= 48; i+=8) {
78 cerr << i << " " << test(64*i) << "\n";
79 }
80 }
81
82
0 #include <NTL/GF2XFactoring.h>
1 #include <NTL/GF2EX.h>
2
3 NTL_CLIENT
4
5
6
7 void test(GF2X& P, GF2EX& f, GF2EX& g, GF2EX& h, GF2EX& hx, GF2EX& s, GF2EX& t)
8 {
9 /* P is the polynomial of the extension
10 * f and g the polynomials
11 * h the gcd
12 * hx the gcd obtained using XGCD
13 * s, t are Bezout coefficients hx=f*s+g*t
14 */
15 GF2EX htest,rf,rg;
16
17 if (h!=hx){
18 cout << P << "\n" << f << "\n" << g << "\n";
19 Error("different gcd:\n");
20 }
21
22 if (max(deg(f), deg(g)) > 0 || min(deg(f), deg(g)) >= 0) {
23 if (deg(s) >= deg(g) || deg(t) >= deg(f)) {
24 cout << P << "\n" << f << "\n" << g << "\n";
25 Error("degree bounds at fault:\n");
26 }
27 }
28
29
30 mul(s,s,f);
31 mul(t,t,g);
32 add(htest,t,s);
33 if (h!=htest){
34 cout << P << "\n" << f << "\n" << g << "\n";
35 Error("xgcd at fault:\n");
36 }
37 if (!IsZero(h)){
38 rem(rf,f,h);
39 rem(rg,g,h);
40 if ((!IsZero(rf))||(!IsZero(rg))){
41 cout << P << "\n" << f << "\n" << g << "\n";
42 Error("not a common divisor\n");
43 }
44 }else{
45 if (!IsZero(f) && !IsZero(g)){
46 cout << "debug:\n";
47 cout << P << "\n" << f << "\n" << g << "\n" << h << "\n";
48 Error("ooops:\n");
49 }
50 }
51 }
52
53
54 int main()
55 {
56
57 GF2X P;
58
59 BuildIrred(P, 128);
60
61 GF2E::init(P);
62
63 for (long i = 0; i < 400; i++) {
64 if (i%10 == 0) cerr << ".";
65 GF2EX f,g,h,s,t,hx;
66
67 long deg_h;
68 if (RandomBnd(2))
69 deg_h = RandomBnd(10)+1;
70 else
71 deg_h = RandomBnd(500)+1;
72
73 random(h, deg_h);
74 SetCoeff(h, deg_h);
75
76 long deg_f;
77 if (RandomBnd(2))
78 deg_f = RandomBnd(10)+1;
79 else
80 deg_f = RandomBnd(1000)+1;
81
82 random(f, deg_f);
83 f *= h;
84
85 long deg_g;
86 if (RandomBnd(2))
87 deg_g = RandomBnd(10)+1;
88 else
89 deg_g = RandomBnd(1000)+1;
90
91 random(g, deg_g);
92 g *= h;
93
94 h = 0;
95
96 GCD(h, f, g);
97 XGCD(hx, s, t, f, g);
98 test(P, f, g, h, hx, s, t);
99 }
100
101 cerr << "\n";
102
103 }
0 #include <NTL/GF2EX.h>
1 #include <NTL/GF2XFactoring.h>
2
3 namespace NTL {
4
5 void PlainMul(GF2EX&,const GF2EX&,const GF2EX&);
6 void mul_disable_plain(GF2EX&,const GF2EX&,const GF2EX&);
7
8 }
9
10 NTL_CLIENT
11
12
13 #define TIME_IT(t, action) \
14 do { \
15 double _t0, _t1; \
16 long _iter = 1; \
17 long _cnt = 0; \
18 do { \
19 _t0 = GetTime(); \
20 for (long _i = 0; _i < _iter; _i++) { action; _cnt++; } \
21 _t1 = GetTime(); \
22 } while ( _t1 - _t0 < 2 && (_iter *= 2)); \
23 t = (_t1 - _t0)/_iter; \
24 } while(0)
25
26
27 long test(long k)
28 {
29 GF2X P;
30
31 BuildIrred(P, k);
32 GF2EPush push(P);
33
34 for (long n = 2; ; n++) {
35 cerr << ",";
36 GF2EX a, b, c;
37 random(a, n);
38 random(b, n);
39 double t1, t2;
40 TIME_IT(t1, mul_disable_plain(c, a, b));
41 TIME_IT(t2, PlainMul(c, a, b));
42 double t = t1/t2;
43 if (t <= 0.95) return n;
44 }
45 }
46
47 int main()
48 {
49 cerr << "0.5 " << test(32) << "\n";
50 for (long i = 1; i <= 40; i++) {
51 cerr << i << " " << test(64*i) << "\n";
52 }
53 }
54
55
0 #include <NTL/GF2EX.h>
1 #include <NTL/GF2XFactoring.h>
2
3 namespace NTL {
4
5 void BuildPlain(GF2EXModulus& F, const GF2EX& f, bool plain);
6
7 }
8
9 NTL_CLIENT
10
11
12 #define TIME_IT(t, action) \
13 do { \
14 double _t0, _t1; \
15 long _iter = 1; \
16 long _cnt = 0; \
17 do { \
18 _t0 = GetTime(); \
19 for (long _i = 0; _i < _iter; _i++) { action; _cnt++; } \
20 _t1 = GetTime(); \
21 } while ( _t1 - _t0 < 2 && (_iter *= 2)); \
22 t = (_t1 - _t0)/_iter; \
23 } while(0)
24
25
26 long test(long k)
27 {
28 GF2X P;
29
30 BuildIrred(P, k);
31 GF2EPush push(P);
32
33 for (long n = 5; ; n+=5) {
34 cerr << ",";
35 GF2EX a, r, f;
36 random(a, 2*n-1);
37 random(f, n);
38 SetCoeff(f, n);
39 GF2EXModulus F1, F2;
40 BuildPlain(F1, f, false);
41 BuildPlain(F2, f, true);
42 double t1, t2;
43 TIME_IT(t1, rem(r, a, F1));
44 TIME_IT(t2, rem(r, a, F2));
45 double t = t1/t2;
46 if (t <= 0.95) return n;
47 }
48 }
49
50 int main()
51 {
52 cerr << "0.5 " << test(32) << "\n";
53 for (long i = 1; i <= 40 ; i++) {
54 cerr << i << " " << test(64*i) << "\n";
55 }
56 }
57
58
165165 cout << "NTL_SAFE_VECTORS=0\n";
166166 #endif
167167
168 #ifdef NTL_ENABLE_AVX_FFT
169 cout << "NTL_ENABLE_AVX_FFT=1\n";
170 #else
171 cout << "NTL_ENABLE_AVX_FFT=0\n";
172 #endif
173
174 #ifdef NTL_AVOID_AVX512
175 cout << "NTL_AVOID_AVX512=1\n";
176 #else
177 cout << "NTL_AVOID_AVX512=0\n";
178 #endif
179
168180 #ifdef NTL_RANGE_CHECK
169181 cout << "NTL_RANGE_CHECK=1\n";
170182 #else
172184 #endif
173185
174186
187
175188 // the following are not actual config flags, but help
176189 // in the Wizard logic
177190
0
01
12 -----------------------------
23 These are basically notes to myself on preparing a new
2728
2829 =====================================
2930
31 TODO: add a runtime flag that makes GetTime call GetWallTime
32
3033 FIXME: maybe it would make more sense to take the +1/-1 logic
3134 out of [cg]_lip_impl block_construct routines and just put it in
3235 the caller: the ZZ_p and ZZVec BlockConstruct stuff: add 1 there...
88
99 quad_float::SetOutputPrecision(25);
1010
11 if (PrecisionOK())
11 long pok;
12 double one = 1.0;
13 quad_float_PrecisionOK(pok, one);
14 if (pok)
1215 cout << "Precision OK\n";
1316 else
1417 cout << "Precision not OK\n";
256256 cerr << "NTL_SAFE_VECTORS\n";
257257 #endif
258258
259 #ifdef NTL_ENABLE_AVX_FFT
260 cerr << "NTL_ENABLE_AVX_FFT\n";
261 #endif
262
263 #ifdef NTL_AVOID_AVX512
264 cerr << "NTL_AVOID_AVX512\n";
265 #endif
266
259267 #ifdef NTL_RANGE_CHECK
260268 cerr << "NTL_RANGE_CHECK\n";
261269 #endif
0 #include <NTL/ZZX.h>
1
2 NTL_CLIENT
3
4
5 #define TIME_IT(t, action) \
6 do { \
7 double _t0, _t1; \
8 long _iter = 1; \
9 long _cnt = 0; \
10 do { \
11 _t0 = GetTime(); \
12 for (long _i = 0; _i < _iter; _i++) { action; _cnt++; } \
13 _t1 = GetTime(); \
14 } while ( _t1 - _t0 < 2 && (_iter *= 2)); \
15 t = (_t1 - _t0)/_iter; \
16 } while(0)
17
18 void FillRandom(ZZX& f, long n, long k)
19 {
20 long sw = RandomBnd(2);
21 f.SetLength(n);
22 for (long i = 0; i < n; i++) {
23 if (sw) {
24 long kk = 1 + RandomBnd(k);
25 RandomBits(f[i], kk);
26 }
27 else {
28 long kk = RandomBnd(k);
29 SetBit(f[i], kk);
30 }
31 if (RandomBnd(2)) NTL::negate(f[i], f[i]);
32 }
33 f.normalize();
34 }
35
36 int main()
37 {
38
39 for (long iter = 0; iter < 4000; iter++) {
40 if (iter % 100 == 0) cerr << ".";
41 long na, nb, k;
42
43 long sw = RandomBnd(3);
44
45 if (sw == 0) {
46 na = RandomBnd(20) + 1;
47 nb = RandomBnd(20) + 1;
48 k = RandomBnd(20) + 1;
49 }
50 else if (sw == 1) {
51 na = RandomBnd(200) + 10;
52 nb = RandomBnd(200) + 10;
53 k = RandomBnd(200) + 10;
54 }
55 else {
56 na = RandomBnd(3000) + 100;
57 nb = RandomBnd(3000) + 100;
58 k = RandomBnd(3000) + 100;
59 }
60
61 ZZX a, b, c, c1;
62 FillRandom(a, na, k);
63 FillRandom(b, nb, k);
64
65 if (RandomBnd(2)) {
66 SSMul(c, a, b);
67 KarMul(c1, a, b);
68 if (c != c1) Error("oops");
69 }
70 else {
71 SSSqr(c, a);
72 KarSqr(c1, a);
73 if (c != c1) Error("oops");
74 }
75 }
76
77 cerr << "\n";
78 }
79
80
66 echo "running ZZTest"
77 ./ZZTest
88 sh RemoveProg ZZTest
9
10 echo
11 echo "---------------------------------"
12 echo "making SSMulTest"
13 $MAKE_PROG SSMulTest
14 echo "running SSMulTest"
15 ./SSMulTest
16 sh RemoveProg SSMulTest
17
18
19 echo
20 echo "---------------------------------"
21 echo "making ZZ_pXTest"
22 $MAKE_PROG ZZ_pXTest
23 echo "running ZZ_pXTest"
24 ./ZZ_pXTest
25 sh RemoveProg ZZ_pXTest
26
27 echo
28 echo "---------------------------------"
29 echo "making lzz_pXTest"
30 $MAKE_PROG lzz_pXTest
31 echo "running lzz_pXTest"
32 ./lzz_pXTest
33 sh RemoveProg lzz_pXTest
934
1035 echo
1136 echo "---------------------------------"
7499 ./GF2EXTest
75100 sh RemoveProg GF2EXTest
76101
102
103 echo
104 echo "---------------------------------"
105 echo "making GF2EXGCDTest"
106 $MAKE_PROG GF2EXGCDTest
107 echo "running GF2EXGCDTest"
108 ./GF2EXGCDTest
109 sh RemoveProg GF2EXGCDTest
77110
78111 echo
79112 echo "---------------------------------"
188221 ./ZZ_pEXTest
189222 sh RemoveProg ZZ_pEXTest
190223
224 echo
225 echo "---------------------------------"
226 echo "making ZZ_pEXGCDTest"
227 $MAKE_PROG ZZ_pEXGCDTest
228 echo "running ZZ_pEXGCDTest"
229 ./ZZ_pEXGCDTest
230 sh RemoveProg ZZ_pEXGCDTest
231
232
191233
192234 echo
193235 echo "---------------------------------"
200242
201243 echo
202244 echo "---------------------------------"
245 echo "making lzz_pEXGCDTest"
246 $MAKE_PROG lzz_pEXGCDTest
247 echo "running lzz_pEXGCDTest"
248 ./lzz_pEXGCDTest
249 sh RemoveProg lzz_pEXGCDTest
250
251
252 echo
253 echo "---------------------------------"
203254 echo "making ThreadTest"
204255 $MAKE_PROG ThreadTest
205256 echo "running ThreadTest"
0 36:0:0
0 39:0:0
0 WinNTL-11_0_0
0 WinNTL-11_3_0
5959
6060 sh CopyFeatures '..' small "$3"
6161 cp ../include/NTL/FFT.h small/include/NTL
62 cp ../include/NTL/FFT_impl.h small/include/NTL
6263 cp ../include/NTL/ctools.h small/include/NTL
6364 cp ../include/NTL/ZZ.h small/include/NTL
6465 cp ../include/NTL/sp_arith.h small/include/NTL
148148 next;
149149 }
150150
151 print "run: $aflag1 $bflag1 $cflag1 NTL_FFT_BIGTAB\n";
151 print "run: $aflag1 $bflag1 $cflag1\n";
152152 GenConfigHeader();
153153 $time1 = RunProg("Poly1TimeTest");
154154
173173
174174 # now see if BIGTAB really helps
175175
176
177176 $Config{"NTL_FFT_BIGTAB"} = 0;
178177 print "run: $aflag $bflag $cflag default\n";
179178 GenConfigHeader();
5454
5555 if (frozen) LogicError("Cannot grow this WordVector");
5656
57 m = max(n, long(NTL_WordVectorExpansionRatio*max_length));
57 m = max(n, _ntl_vec_grow(max_length));
5858
5959 m = ((m+NTL_WordVectorMinAlloc-1)/NTL_WordVectorMinAlloc)*NTL_WordVectorMinAlloc;
6060 _ntl_ulong *p = rep - 2;
14451445
14461446
14471447
1448 void sub(ZZ& x, const ZZ& a, long b)
1449 {
1450 NTL_ZZRegister(B);
1451 conv(B, b);
1452 sub(x, a, B);
1453 }
1454
14551448 void sub(ZZ& x, long a, const ZZ& b)
14561449 {
14571450 NTL_ZZRegister(A);
19361929 }
19371930
19381931
1939 void old_RandomStream::do_get(unsigned char *NTL_RESTRICT res, long n)
1932 void old_RandomStream::do_get(unsigned char *res, long n)
19401933 {
19411934 if (n < 0) LogicError("RandomStream::get: bad args");
19421935
5959 CHECK(q1.validate() && r1.validate() && q == q1 && r == r1);
6060 }
6161
62 cerr << "\nvalidating mul...";
63 for (long i = 0; i < 1000000; i++) {
64 long a_len = RandomBnd(1000)+1;
65 long b_len = RandomBnd(1000)+1;
66
67 ZZ a, b, c;
68
69 RandomLen(a, a_len);
70 RandomLen(b, b_len);
71
72 if (RandomBnd(2)) a = -a;
73 if (RandomBnd(2)) b = -b;
74
75 long p = 7919;
76 long r = MulMod(rem(a, p), rem(b, p), p);
77 long s = MulMod(rem(a, p), rem(a, p), p);
78
79 switch (RandomBnd(5)) {
80 case 0:
81 mul(c, a, b);
82 CHECK(c.validate() && rem(c, p) == r);
83 break;
84
85 case 1:
86 mul(a, a, b);
87 CHECK(a.validate() && rem(a, p) == r);
88 break;
89
90 case 2:
91 mul(b, a, b);
92 CHECK(b.validate() && rem(b, p) == r);
93 break;
94
95 case 3:
96 mul(c, a, a);
97 CHECK(c.validate() && rem(c, p) == s);
98 break;
99
100 case 4:
101 mul(a, a, a);
102 CHECK(a.validate() && rem(a, p) == s);
103 break;
104 }
105 }
106
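[editor's note] The mul check above is probabilistic rather than exhaustive: each product is verified only modulo the fixed prime 7919, so an incorrect result would slip through a given trial only if the error happened to be divisible by 7919, roughly a 1-in-7919 chance per iteration.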
62107 cerr << "\nvalidating squaring...";
63 for (long i = 0; i < 200000; i++) {
64 long a_len = RandomBnd(8000)+5;
65
66 ZZ a, b, a1, c;
67 RandomLen(a, a_len);
68
69 sqr(b, a);
108 for (long i = 0; i < 1000000; i++) {
109 long a_len = RandomBnd(1000)+1;
110
111 ZZ a, b, a1, a2, c;
112 RandomLen(a, a_len);
113
114 if (RandomBnd(2)) a = -a;
115
70116 a1 = a;
71 mul(c, a, a1);
72
73 CHECK(b.validate() && c.validate() && b == c);
117 a2 = a;
118
119 if (RandomBnd(2)) {
120 sqr(b, a);
121 mul(c, a1, a2);
122 CHECK(b.validate() && c.validate() && b == c);
123 }
124 else {
125 sqr(a, a);
126 mul(c, a1, a2);
127 CHECK(a.validate() && c.validate() && a == c);
128 }
74129 }
75130
76131 cerr << "\nvalidating SqrRoot...";
155210 }
156211
157212 cerr << "\nvalidating GCD...";
158 for (long i = 0; i < 100000; i++) {
159 long a_len = RandomBnd(4000)+5;
160 long b_len = RandomBnd(4000)+5;
161 long c_len = RandomBnd(500)+1;
213 for (long i = 0; i < 1000000; i++) {
214 long a_len = RandomBnd(1000)+1;
215 long b_len = RandomBnd(1000)+1;
216 long c_len = RandomBnd(200)+1;
162217
163218 ZZ a, b, c;
164219 RandomLen(a, a_len);
167222
168223 a *= c;
169224 b *= c;
225
226 if (RandomBnd(2)) a = -a;
227 if (RandomBnd(2)) b = -b;
170228
171229 ZZ d, s, t, d1;
172230
176234 CHECK(d.validate() && s.validate() && t.validate() && d1.validate());
177235 CHECK(d == d1 && d == a*s + b*t);
178236 CHECK(divide(a, d) && divide(b, d));
237
238 CHECK(abs(s) <= 1 || 2*d*abs(s) < abs(b));
239 CHECK(abs(t) <= 1 || 2*d*abs(t) < abs(a));
240
241 if (a < 0) { a = -a; s = -s; }
242 if (b < 0) { b = -b; t = -t; }
243 if (a < b) { swap(a, b); swap(s, t); }
244
245 // so now we have a >= b >= 0
246 // check that s in (-b/2*d, b/2*d]
247 CHECK(2*d*s > -b && 2*d*s <= b);
179248 }
180249
181250 cerr << "\nvalidating InvMod...";
182251 for (long i = 0; i < 100000; i++) {
183 long n_len = RandomBnd(4000)+5;
252 long n_len = RandomBnd(4000)+4;
184253
185254 ZZ a, n, x;
186255 RandomLen(n, n_len);
187256 RandomBnd(a, n);
188257
189258 long r = InvModStatus(x, a, n);
190 CHECK((r == 0 && (x * a) % n == 1) || (r == 1 && x != 1 && x == GCD(a, n)) );
259 CHECK((r == 0 && (x * a) % n == 1 && 0 <= x && x < n) ||
260 (r == 1 && x != 1 && x == GCD(a, n)) );
191261 }
192262
193263 cerr << "\nvalidating RatRecon...";
756756
757757
758758
759
760 /* Compute a = b * 2^l mod p, where p = 2^n+1. 0<=l<=n and 0<b<p are
759 static void
760 SS_AddMod(ZZ& x, const ZZ& a, const ZZ& b, const ZZ& p, long n)
761 // x = a + b mod p, where p = 2^n+1, a, b in [0, p).
762 // x may not alias p.
763 {
764 #ifndef NTL_PROVIDES_SS_LIP_IMPL
765 add(x, a, b);
766 if (x >= p) {
767 x--; SwitchBit(x, n); // x -= p
768 }
769 #else
770 SS_AddMod_lip_impl(x, a, b, p, n);
771 #endif
772 }
773
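[editor's note] A quick worked check of the branchless reduction above (nothing beyond the comment's own claim): when the sum x lies in [p, 2p-2], x-1 lies in [2^n, 2^{n+1}-1], so bit n of x-1 is set; SwitchBit(x, n) clears it, subtracting 2^n, and the two steps together subtract 2^n + 1 = p. For example, with n = 4 and p = 17, a = 15 and b = 10 give x = 25; x-- yields 24 = 11000 in binary, clearing bit 4 leaves 8, and indeed 25 mod 17 = 8.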
774 static void
775 SS_SubMod(ZZ& x, const ZZ& a, const ZZ& b, const ZZ& p, long n)
776 // x = a - b mod p, where p = 2^n+1, a, b in [0, p).
777 // x may not alias b or p.
778 {
779 #ifndef NTL_PROVIDES_SS_LIP_IMPL
780 if (a < b) {
781 add(x, a, p);
782 SubPos(x, x, b);
783 }
784 else {
785 SubPos(x, a, b);
786 }
787 #else
788 SS_SubMod_lip_impl(x, a, b, p, n);
789 #endif
790 }
791
792
793
794 /* Compute a = b * 2^e mod p, where p = 2^n+1. 0<=e<n and 0<b<p are
761795 assumed. */
762 static void LeftRotate(ZZ& a, const ZZ& b, long l, const ZZ& p, long n, ZZ& scratch)
763 {
764 if (l == 0) {
796
797 static void
798 LeftRotate(ZZ& a, const ZZ& b, long e, const ZZ& p, long n, ZZ& scratch)
799 {
800 #ifndef NTL_PROVIDES_SS_LIP_IMPL
801 if (e == 0) {
765802 if (&a != &b) {
766803 a = b;
767804 }
768805 return;
769806 }
770807
771 /* scratch := upper l bits of b */
772 RightShift(scratch, b, n - l);
773 /* a := 2^l * lower n - l bits of b */
774 trunc(a, b, n - l);
775 LeftShift(a, a, l);
808 /* scratch := upper e bits of b */
809 RightShift(scratch, b, n - e);
810 /* a := 2^e * lower n - e bits of b */
811 trunc(a, b, n - e);
812 LeftShift(a, a, e);
776813 /* a -= scratch */
777 sub(a, a, scratch);
778 if (sign(a) < 0) {
779 add(a, a, p);
780 }
781 }
782
783
784 /* Compute a = b * 2^l mod p, where p = 2^n+1. 0<=b<p is assumed. */
785 static void Rotate(ZZ& a, const ZZ& b, long l, const ZZ& p, long n, ZZ& scratch)
786 {
787 if (IsZero(b)) {
788 clear(a);
789 return;
790 }
791
792 /* l %= 2n */
793 if (l >= 0) {
794 l %= (n << 1);
795 } else {
796 l = (n << 1) - 1 - (-(l + 1) % (n << 1));
797 }
798
799 /* a = b * 2^l mod p */
800 if (l < n) {
801 LeftRotate(a, b, l, p, n, scratch);
802 } else {
803 LeftRotate(a, b, l - n, p, n, scratch);
804 SubPos(a, p, a);
805 }
806 }
807
808
809
810 /* Fast Fourier Transform. a is a vector of length 2^l, 2^l divides 2n,
811 p = 2^n+1, w = 2^r mod p is a primitive (2^l)th root of
812 unity. Returns a(1),a(w),...,a(w^{2^l-1}) mod p in bit-reverse
813 order. */
814 static void fft(ZZVec& a, long r, long l, const ZZ& p, long n)
815 {
816 long round;
817 long off, i, j, e;
818 long halfsize;
819 ZZ tmp, tmp1;
820 ZZ scratch;
821
822 for (round = 0; round < l; round++, r <<= 1) {
823 halfsize = 1L << (l - 1 - round);
824 for (i = (1L << round) - 1, off = 0; i >= 0; i--, off += halfsize) {
825 for (j = 0, e = 0; j < halfsize; j++, off++, e+=r) {
826 /* One butterfly :
827 ( a[off], a[off+halfsize] ) *= ( 1 w^{j2^round} )
828 ( 1 -w^{j2^round} ) */
829 /* tmp = a[off] - a[off + halfsize] mod p */
830 sub(tmp, a[off], a[off + halfsize]);
831 if (sign(tmp) < 0) {
832 add(tmp, tmp, p);
833 }
834 /* a[off] += a[off + halfsize] mod p */
835 add(a[off], a[off], a[off + halfsize]);
836 sub(tmp1, a[off], p);
837 if (sign(tmp1) >= 0) {
838 a[off] = tmp1;
839 }
840 /* a[off + halfsize] = tmp * w^{j2^round} mod p */
841 Rotate(a[off + halfsize], tmp, e, p, n, scratch);
842 }
814 SS_SubMod(a, a, scratch, p, n);
815 #else
816 LeftRotate_lip_impl(a, b, e, p, n, scratch);
817 #endif
818 }
819
820
821 #define SS_FFT_THRESH (4)
822 #define SS_NTEMPS (3)
823 #define SS_FFT_RDUP (3)
824
825 static long
826 SS_FFTRoundUp(long xn, long k)
827 {
828 long n = 1L << k;
829 if (xn <= 0) return n;
830
831 xn = ((xn+((1L << SS_FFT_RDUP)-1)) >> SS_FFT_RDUP) << SS_FFT_RDUP;
832
833 if (xn > n - (n >> 4)) xn = n;
834
835 return xn;
836 }
837
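[editor's note] Two illustrative evaluations of SS_FFTRoundUp (hypothetical inputs): with k = 10 (so n = 1024) and xn = 100, xn is rounded up to the next multiple of 2^SS_FFT_RDUP = 8, giving 104, and since 104 <= n - n/16 = 960 the value 104 is returned; with xn = 1000 the rounded value 1000 exceeds 960, so the routine snaps up to the full length 1024.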
838
839
840 // p = 2^n+1, where n = r*2^{l-1}, so 2^r is primitive 2^l-th root
841 // of unity mod p.
842
843 // j in [0, 2^{level-1})
844 // a = b*2^{j*r*2^{l-level}}
845 static void
846 Rotate(ZZ& a, const ZZ& b, long j, long level,
847 long r, long l, const ZZ& p, long n, ZZ* tmp)
848 {
849 if (l-level >= 0)
850 LeftRotate(a, b, (j*r) << (l-level), p, n, tmp[0]);
851 else if (((j*r) & 1) == 0)
852 LeftRotate(a, b, (j*r) >> 1, p, n, tmp[0]);
853 else {
854 // use sqrt(2) = 2^{3n/4} - 2^{n/4}
855
856 long k = (j*r) >> 1; // j*r = 2*k + 1
857
858 // now compute a = b*2^{k+1/2} mod p
859
860 // a = b*{2^k} mod p
861 LeftRotate(a, b, k, p, n, tmp[0]);
862
863 // tmp[1] = a*2^{n/4} mod p
864 LeftRotate(tmp[1], a, n >> 2, p, n, tmp[0]);
865
866 // a = a*2^{3n/4} mod p
867 LeftRotate(a, a, 3*(n >> 2), p, n, tmp[0]);
868
869 // a -= tmp[1] mod p
870 SS_SubMod(a, a, tmp[1], p, n);
871 }
872 }
873
874
875 static void
876 SS_butterfly(ZZ& x, ZZ& y, const ZZ& p, long n, ZZ* tmp)
877 // (x, y) := (x+y, x-y)
878 {
879 /* tmp[0] = x - y mod p */
880 SS_SubMod(tmp[0], x, y, p, n);
881
882 /* x += y mod p */
883 SS_AddMod(x, x, y, p, n);
884
885 y = tmp[0];
886 }
887
888 static void
889 SS_fwd_butterfly(ZZ& x, ZZ& y, long j, long level,
890 long r, long l, const ZZ& p, long n,
891 ZZ* tmp)
892
893 // ( x, y ) *= ( 1 2^{j*r*2^{l-level}} )
894 // ( 1 -2^{j*r*2^{l-level}} )
895
896 {
897 /* tmp[0] = x - y mod p */
898 SS_SubMod(tmp[0], x, y, p, n);
899
900 /* x += y mod p */
901 SS_AddMod(x, x, y, p, n);
902
903 /* y = tmp[0] * 2^{j*r*2^{l-level}} mod p */
904 Rotate(y, tmp[0], j, level, r, l, p, n, tmp+1);
905 }
906
907 static void
908 SS_inv_butterfly(ZZ& x, ZZ& y, long j, long level,
909 long r, long l, const ZZ& p, long n,
910 ZZ* tmp)
911
912 // ( x, y ) *= ( 1 1 )
913 // ( 2^{-j*r*2^{l-level}} -2^{-j*r*2^{l-level}} )
914
915 // *** should not be called with j == 0
916 // call SS_butterfly instead
917
918 {
919 /* tmp[0] = y * 2^{(2^{level-1}-j)*r*2^{l-level}} mod p */
920 Rotate(tmp[0], y, (1L<<(level-1))-j, level, r, l, p, n, tmp+1);
921
922 /* y = x + tmp[0] mod p */
923 SS_AddMod(y, x, tmp[0], p, n); // NEGATED
924
925 /* x = x - tmp[0] mod p */
926 SS_SubMod(x, x, tmp[0], p, n); // NEGATED
927 }
928
929
930 // Much of the following logic is taken from the code in FFT.cpp
931 // for single-precision modular FFT's, which itself is adapted
932 // from code originally written by David Harvey.
933 // See copyright notice in FFT.cpp.
934
935 // size == 2^level
936 static void
937 fft_layer(ZZ* xp, long blocks, long size, long level, long r, long l,
938 const ZZ& p, long n, ZZ* tmp)
939 {
940 size /= 2;
941
942 do {
943 ZZ *xp0 = xp;
944 ZZ *xp1 = xp + size;
945
946 for (long j = 0; j < size; j++)
947 SS_fwd_butterfly(xp0[j], xp1[j], j, level, r, l, p, n, tmp);
948
949 xp += 2*size;
950 } while (--blocks != 0);
951 }
952
953 static void
954 fft_base(ZZ* xp, long lgN, long r, long l, const ZZ& p, long n,
955 ZZ* tmp)
956 {
957 long N = 1L << lgN;
958
959 for (long j = lgN, size = N, blocks = 1;
960 j >= 1; j--, blocks <<= 1, size >>= 1)
961 fft_layer(xp, blocks, size, j, r, l, p, n, tmp);
962 }
963
964
965 static void
966 fft_rec(ZZ* xp, long lgN, long r, long l, const ZZ& p, long n,
967 ZZ* tmp)
968 {
969 if (lgN <= SS_FFT_THRESH) {
970 fft_base(xp, lgN, r, l, p, n, tmp);
971 return;
972 }
973
974 long N = 1L << lgN;
975 long half = N >> 1;
976
977 ZZ *xp0 = xp;
978 ZZ *xp1 = xp + half;
979
980 for (long j = 0; j < half; j++)
981 SS_fwd_butterfly(xp0[j], xp1[j], j, lgN, r, l, p, n, tmp);
982
983 fft_rec(xp0, lgN-1, r, l, p, n, tmp);
984 fft_rec(xp1, lgN-1, r, l, p, n, tmp);
985 }
986
987
988 static void
989 fft_short(ZZ* xp, long yn, long xn, long lgN,
990 long r, long l, const ZZ& p, long n,
991 ZZ* tmp)
992 {
993 long N = 1L << lgN;
994
995 if (yn == N)
996 {
997 if (xn == N && lgN <= SS_FFT_THRESH)
998 {
999 // no truncation
1000 fft_base(xp, lgN, r, l, p, n, tmp);
1001 return;
1002 }
8431003 }
844 }
845 }
846
847 /* Inverse FFT. r must be the same as in the call to FFT. Result is
848 by 2^l too large. */
849 static void ifft(ZZVec& a, long r, long l, const ZZ& p, long n)
850 {
851 long round;
852 long off, i, j, e;
853 long halfsize;
854 ZZ tmp, tmp1;
855 ZZ scratch;
856
857 for (round = l - 1, r <<= l - 1; round >= 0; round--, r >>= 1) {
858 halfsize = 1L << (l - 1 - round);
859 for (i = (1L << round) - 1, off = 0; i >= 0; i--, off += halfsize) {
860 for (j = 0, e = 0; j < halfsize; j++, off++, e+=r) {
861 /* One inverse butterfly :
862 ( a[off], a[off+halfsize] ) *= ( 1 1 )
863 ( w^{-j2^round} -w^{-j2^round} ) */
864 /* a[off + halfsize] *= w^{-j2^round} mod p */
865 Rotate(a[off + halfsize], a[off + halfsize], -e, p, n, scratch);
866 /* tmp = a[off] - a[off + halfsize] */
867 sub(tmp, a[off], a[off + halfsize]);
868
869 /* a[off] += a[off + halfsize] mod p */
870 add(a[off], a[off], a[off + halfsize]);
871 sub(tmp1, a[off], p);
872 if (sign(tmp1) >= 0) {
873 a[off] = tmp1;
874 }
875 /* a[off+halfsize] = tmp mod p */
876 if (sign(tmp) < 0) {
877 add(a[off+halfsize], tmp, p);
878 } else {
879 a[off+halfsize] = tmp;
880 }
881 }
1004
1005
1006 // divide-and-conquer algorithm
1007
1008 long half = N >> 1;
1009
1010 if (yn <= half)
1011 {
1012 if (xn <= half)
1013 {
1014 fft_short(xp, yn, xn, lgN-1, r, l, p, n, tmp);
1015 }
1016 else
1017 {
1018 xn -= half;
1019
1020 // (X, Y) -> X + Y
1021 for (long j = 0; j < xn; j++)
1022 SS_AddMod(xp[j], xp[j], xp[j + half], p, n);
1023
1024 fft_short(xp, yn, half, lgN-1, r, l, p, n, tmp);
1025 }
8821026 }
883 }
884 }
1027 else
1028 {
1029 yn -= half;
1030
1031 ZZ *xp0 = xp;
1032 ZZ *xp1 = xp + half;
1033
1034 if (xn <= half)
1035 {
1036 // X -> (X, w*X)
1037 for (long j = 0; j < xn; j++)
1038 Rotate(xp1[j], xp0[j], j, lgN, r, l, p, n, tmp);
1039
1040 fft_short(xp0, half, xn, lgN-1, r, l, p, n, tmp);
1041 fft_short(xp1, yn, xn, lgN-1, r, l, p, n, tmp);
1042 }
1043 else
1044 {
1045 xn -= half;
1046
1047 // (X, Y) -> (X + Y, w*(X - Y))
1048 for (long j = 0; j < xn; j++)
1049 SS_fwd_butterfly(xp0[j], xp1[j], j, lgN, r, l, p, n, tmp);
1050
1051 // X -> (X, w*X)
1052 for (long j = xn; j < half; j++)
1053 Rotate(xp1[j], xp0[j], j, lgN, r, l, p, n, tmp);
1054
1055 fft_short(xp0, half, half, lgN-1, r, l, p, n, tmp);
1056 fft_short(xp1, yn, half, lgN-1, r, l, p, n, tmp);
1057 }
1058 }
1059 }
1060
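[editor's note] In outline, the split in fft_short is the usual decimation-in-frequency step: with N = 2M, input halves x_i and y_i = x_{i+M}, and w of order N (so w^M = -1),
    a(w^{2j})   = sum_{i<M} (x_i + y_i) * w^{2ij}
    a(w^{2j+1}) = sum_{i<M} w^i * (x_i - y_i) * w^{2ij},
and because the outputs are produced in bit-reverse order, the even-exponent values fill the first half of the array and the odd-exponent values the second half. This is why inputs beyond xn (known to be zero) and outputs beyond yn (not wanted) let whole sub-transforms be skipped.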
1061
1062
1063 static void
1064 fft(ZZVec& a, long r, long l, const ZZ& p, long n)
1065 {
1066 ZZ tmp[SS_NTEMPS];
1067 fft_rec(&a[0], l, r, l, p, n, &tmp[0]);
1068 }
1069
1070 static void
1071 fft1(ZZVec& a, long r, long l, long l1, const ZZ& p, long n)
1072 {
1073 ZZ tmp[SS_NTEMPS];
1074 fft_rec(&a[0], l, r, l1, p, n, &tmp[0]);
1075 }
1076
1077 static void
1078 fft_trunc(ZZVec& a, long yn, long xn,
1079 long r, long l, long l1, const ZZ& p, long n)
1080 {
1081 ZZ tmp[SS_NTEMPS];
1082 fft_short(&a[0], yn, xn, l, r, l1, p, n, &tmp[0]);
1083 }
1084
1085 static void
1086 ifft_layer(ZZ* xp, long blocks, long size, long level, long r, long l,
1087 const ZZ& p, long n, ZZ* tmp)
1088 {
1089 size /= 2;
1090
1091 do {
1092 ZZ *xp0 = xp;
1093 ZZ *xp1 = xp + size;
1094
1095 SS_butterfly(xp0[0], xp1[0], p, n, tmp);
1096 for (long j = 1; j < size; j++)
1097 SS_inv_butterfly(xp0[j], xp1[j], j, level, r, l, p, n, tmp);
1098
1099 xp += 2*size;
1100 } while (--blocks != 0);
1101 }
1102
1103 static void
1104 ifft_base(ZZ* xp, long lgN, long r, long l, const ZZ& p, long n,
1105 ZZ* tmp)
1106 {
1107 long N = 1L << lgN;
1108
1109 for (long j = 1, size = 2, blocks = N/2;
1110 j <= lgN; j++, blocks >>= 1, size <<= 1)
1111 ifft_layer(xp, blocks, size, j, r, l, p, n, tmp);
1112 }
1113
1114
1115 static void
1116 ifft_rec(ZZ* xp, long lgN, long r, long l, const ZZ& p, long n,
1117 ZZ* tmp)
1118 {
1119 if (lgN <= SS_FFT_THRESH) {
1120 ifft_base(xp, lgN, r, l, p, n, tmp);
1121 return;
1122 }
1123
1124 long N = 1L << lgN;
1125 long half = N >> 1;
1126
1127 ZZ *xp0 = xp;
1128 ZZ *xp1 = xp + half;
1129
1130 ifft_rec(xp0, lgN-1, r, l, p, n, tmp);
1131 ifft_rec(xp1, lgN-1, r, l, p, n, tmp);
1132
1133 SS_butterfly(xp0[0], xp1[0], p, n, tmp);
1134 for (long j = 1; j < half; j++)
1135 SS_inv_butterfly(xp0[j], xp1[j], j, lgN, r, l, p, n, tmp);
1136 }
1137
1138 static void
1139 ifft_short2(ZZ* xp, long yn, long lgN,
1140 long r, long l, const ZZ& p, long n, ZZ* tmp);
1141
1142 static void
1143 ifft_short1(ZZ* xp, long yn, long lgN,
1144 long r, long l, const ZZ& p, long n, ZZ* tmp)
1145
1146 {
1147 long N = 1L << lgN;
1148
1149 if (yn == N && lgN <= SS_FFT_THRESH)
1150 {
1151 // no truncation
1152 ifft_base(xp, lgN, r, l, p, n, tmp);
1153 return;
1154 }
1155
1156 // divide-and-conquer algorithm
1157
1158 long half = N >> 1;
1159
1160 if (yn <= half)
1161 {
1162 // X -> 2X
1163 for (long j = 0; j < yn; j++)
1164 SS_AddMod(xp[j], xp[j], xp[j], p, n);
1165
1166 ifft_short1(xp, yn, lgN-1, r, l, p, n, tmp);
1167 }
1168 else
1169 {
1170 ZZ *xp0 = xp;
1171 ZZ *xp1 = xp + half;
1172
1173 ifft_short1(xp0, half, lgN-1, r, l, p, n, tmp);
1174
1175 yn -= half;
1176
1177 // X -> (2X, w*X)
1178 for (long j = yn; j < half; j++)
1179 {
1180 tmp[0] = xp0[j];
1181 SS_AddMod(xp0[j], xp0[j], xp0[j], p, n);
1182 Rotate(xp1[j], tmp[0], j, lgN, r, l, p, n, tmp+1);
1183 }
1184
1185 ifft_short2(xp1, yn, lgN-1, r, l, p, n, tmp);
1186
1187 // (X, Y) -> (X + Y/w, X - Y/w)
1188 SS_butterfly(xp0[0], xp1[0], p, n, tmp);
1189 for (long j = 1; j < yn; j++)
1190 SS_inv_butterfly(xp0[j], xp1[j], j, lgN, r, l, p, n, tmp);
1191 }
1192 }
1193
1194
1195 static void
1196 ifft_short2(ZZ* xp, long yn, long lgN,
1197 long r, long l, const ZZ& p, long n, ZZ* tmp)
1198
1199 {
1200 long N = 1L << lgN;
1201
1202 if (yn == N && lgN <= SS_FFT_THRESH)
1203 {
1204 // no truncation
1205 ifft_base(xp, lgN, r, l, p, n, tmp);
1206 return;
1207 }
1208
1209 // divide-and-conquer algorithm
1210
1211 long half = N >> 1;
1212
1213 if (yn <= half)
1214 {
1215 // X -> 2X
1216 for (long j = 0; j < yn; j++)
1217 SS_AddMod(xp[j], xp[j], xp[j], p, n);
1218
1219 // (X, Y) -> X + Y
1220 for (long j = yn; j < half; j++)
1221 SS_AddMod(xp[j], xp[j], xp[j + half], p, n);
1222
1223 ifft_short2(xp, yn, lgN-1, r, l, p, n, tmp);
1224
1225 // (X, Y) -> X - Y
1226 for (long j = 0; j < yn; j++)
1227 SS_SubMod(xp[j], xp[j], xp[j + half], p, n);
1228 }
1229 else
1230 {
1231 ZZ *xp0 = xp;
1232 ZZ *xp1 = xp + half;
1233
1234 ifft_short1(xp0, half, lgN-1, r, l, p, n, tmp);
1235
1236 yn -= half;
1237
1238 // (X, Y) -> (2X - Y, w*(X - Y))
1239 for (long j = yn; j < half; j++)
1240 {
1241 SS_SubMod(tmp[0], xp0[j], xp1[j], p, n);
1242 SS_AddMod(xp0[j], xp0[j], tmp[0], p, n);
1243 Rotate(xp1[j], tmp[0], j, lgN, r, l, p, n, tmp+1);
1244 }
1245
1246
1247 ifft_short2(xp1, yn, lgN-1, r, l, p, n, tmp);
1248
1249 // (X, Y) -> (X + Y/w, X - Y/w)
1250 SS_butterfly(xp0[0], xp1[0], p, n, tmp);
1251 for (long j = 1; j < yn; j++)
1252 SS_inv_butterfly(xp0[j], xp1[j], j, lgN, r, l, p, n, tmp);
1253 }
1254 }
1255
1256
1257 static void
1258 ifft(ZZVec& a, long r, long l, const ZZ& p, long n)
1259 {
1260 ZZ tmp[SS_NTEMPS];
1261 ifft_rec(&a[0], l, r, l, p, n, &tmp[0]);
1262 }
1263
1264 static void
1265 ifft1(ZZVec& a, long r, long l, long l1, const ZZ& p, long n)
1266 {
1267 ZZ tmp[SS_NTEMPS];
1268 ifft_rec(&a[0], l, r, l1, p, n, &tmp[0]);
1269 }
1270
1271 static void
1272 ifft_trunc(ZZVec& a, long yn, long r, long l, long l1, const ZZ& p, long n)
1273 {
1274 ZZ tmp[SS_NTEMPS];
1275 ifft_short1(&a[0], yn, l, r, l1, p, n, &tmp[0]);
1276 }
1277
1278
8851279
8861280
8871281
8931287 absolute value. The algorithm is not called recursively;
8941288 coefficient arithmetic is done directly.*/
8951289
1290 // The original version of SSMul was written by Juergen Gerhard.
1291 // However, it has been almost completely re-written so as
1292 // to provide the following improvements:
1293 // * uses truncated FFT and Inverse FFT algorithms,
1294 // for better performance between powers of 2
1295 // * better cache locality because of divide and conquer structure
1296 // * better performance because of sqrt(2) trick
1297
8961298 void SSMul(ZZX& c, const ZZX& a, const ZZX& b)
8971299 {
8981300 if (&a == &b) {
9131315
9141316 /* Choose m and r suitably */
9151317 long l = NextPowerOfTwo(n + 1) - 1; /* 2^l <= n < 2^{l+1} */
916 long m2 = 1L << (l + 1); /* m2 = 2m = 2^{l+1} */
1318 long N = 1L << (l + 1); /* N = 2^{l+1} */
9171319 /* Bitlength of the product: if the coefficients of a are absolutely less
9181320 than 2^ka and the coefficients of b are absolutely less than 2^kb, then
9191321 the coefficients of ab are absolutely less than
9231325 long r = (bound >> l) + 1;
9241326 long mr = r << l;
9251327
1328 // sqrt(2) trick
1329 long l1 = l;
1330 if (l1 >= 3) {
1331 long alt_l1 = l-1;
1332 long alt_r = (bound >> alt_l1) + 1;
1333 long alt_mr = alt_r << alt_l1;
1334
1335 if (alt_mr < mr - mr/8) {
1336 l1 = alt_l1;
1337 r = alt_r;
1338 mr = alt_mr;
1339 }
1340 }
1341
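[editor's note] A rough worked example of this choice (hypothetical numbers): with n = 100 we get l = 6 and N = 128; if bound = 129 bits, then r = (129 >> 6) + 1 = 3 and mr = 192, while the l-1 alternative gives r = 5 and mr = 160. Since 160 < 192 - 192/8 = 168, the smaller ring p = 2^160 + 1 is selected (l1 = 5), and the odd-exponent rotations then go through the sqrt(2) branch of Rotate.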
9261342 /* p := 2^{mr}+1 */
9271343 ZZ p;
9281344 set(p);
9311347
9321348 /* Make coefficients of a and b positive */
9331349 ZZVec aa, bb;
934 aa.SetSize(m2, p.size());
935 bb.SetSize(m2, p.size());
1350 aa.SetSize(N, p.size());
1351 bb.SetSize(N, p.size());
9361352
9371353 for (long i = 0; i <= deg(a); i++) {
9381354 if (sign(a.rep[i]) >= 0) {
9501366 }
9511367 }
9521368
953
954 /* 2m-point FFT's mod p */
955 fft(aa, r, l + 1, p, mr);
956 fft(bb, r, l + 1, p, mr);
1369 long yn = SS_FFTRoundUp(n+1, l+1);
1370
1371 /* N-point FFT's mod p */
1372 fft_trunc(aa, yn, SS_FFTRoundUp(na+1, l+1), r, l+1, l1+1, p, mr);
1373 fft_trunc(bb, yn, SS_FFTRoundUp(nb+1, l+1), r, l+1, l1+1, p, mr);
9571374
9581375
9591376 /* Pointwise multiplication aa := aa * bb mod p */
9601377 // NOTE: we attempt to parallelize this
9611378 // Unfortunately, the bulk of the time is spent
9621379 // in the FFT, so this is not very effective
963 NTL_EXEC_RANGE(m2, first, last)
1380 NTL_EXEC_RANGE(yn, first, last)
9641381 ZZ tmp, ai;
9651382 for (long i = first; i < last; i++) {
9661383 mul(ai, aa[i], bb[i]);
9761393 }
9771394 NTL_EXEC_RANGE_END
9781395
979 ifft(aa, r, l + 1, p, mr);
980
981 /* Retrieve c, dividing by 2m, and subtracting p where necessary */
1396 ifft_trunc(aa, yn, r, l+1, l1+1, p, mr);
1397
1398 /* Retrieve c, dividing by N, and subtracting p where necessary */
9821399 c.rep.SetLength(n + 1);
9831400 ZZ ai, tmp, scratch;
9841401 for (long i = 0; i <= n; i++) {
9851402 ai = aa[i];
9861403 ZZ& ci = c.rep[i];
9871404 if (!IsZero(ai)) {
988 /* ci = -ai * 2^{mr-l-1} = ai * 2^{-l-1} = ai / 2m mod p */
1405 /* ci = -ai * 2^{mr-l-1} = ai * 2^{-l-1} = ai / N mod p */
9891406 LeftRotate(ai, ai, mr - l - 1, p, mr, scratch);
9901407 sub(tmp, p, ai);
9911408 if (NumBits(tmp) >= mr) { /* ci >= (p-1)/2 */
10161433 long r = (bound >> l) + 1;
10171434 long mr = r << l;
10181435
1436 // sqrt(2) trick
1437 long l1 = l;
1438 if (l1 >= 3) {
1439 long alt_l1 = l-1;
1440 long alt_r = (bound >> alt_l1) + 1;
1441 long alt_mr = alt_r << alt_l1;
1442
1443 if (alt_mr < mr - mr/8) {
1444 l1 = alt_l1;
1445 r = alt_r;
1446 mr = alt_mr;
1447 }
1448 }
1449
10191450 return double(mr + 1)/double(bound);
10201451 }
10211452
10431474
10441475 if (nt == 1) {
10451476
1046 return (k >= 26 && rat < 1.20) ||
1477 return (k >= 13 && rat < 1.15) ||
1478 (k >= 26 && rat < 1.30) ||
10471479 (k >= 53 && rat < 1.60) ||
10481480 (k >= 106 && rat < 1.80) ||
10491481 (k >= 212 && rat < 2.00);
11251557 }
11261558 }
11271559
1128
1129
11301560 void SSSqr(ZZX& c, const ZZX& a)
1561
11311562 {
11321563 long na = deg(a);
1564
11331565 if (na <= 0) {
11341566 PlainSqr(c, a);
11351567 return;
11381570 long n = na + na; /* degree of the product */
11391571
11401572
1573 /* Choose m and r suitably */
11411574 long l = NextPowerOfTwo(n + 1) - 1; /* 2^l <= n < 2^{l+1} */
1142 long m2 = 1L << (l + 1); /* m2 = 2m = 2^{l+1} */
1575 long N = 1L << (l + 1); /* N = 2^{l+1} */
11431576 long bound = 2 + NumBits(na) + 2*MaxBits(a);
1577 /* Let r be minimal so that mr > bound */
11441578 long r = (bound >> l) + 1;
11451579 long mr = r << l;
1580
1581 // sqrt(2) trick
1582 long l1 = l;
1583 if (l1 >= 3) {
1584 long alt_l1 = l-1;
1585 long alt_r = (bound >> alt_l1) + 1;
1586 long alt_mr = alt_r << alt_l1;
1587
1588 if (alt_mr < mr - mr/8) {
1589 l1 = alt_l1;
1590 r = alt_r;
1591 mr = alt_mr;
1592 }
1593 }
11461594
11471595 /* p := 2^{mr}+1 */
11481596 ZZ p;
11501598 LeftShift(p, p, mr);
11511599 add(p, p, 1);
11521600
1601 /* Make coefficients of a positive */
11531602 ZZVec aa;
1154 aa.SetSize(m2, p.size());
1603 aa.SetSize(N, p.size());
11551604
11561605 for (long i = 0; i <= deg(a); i++) {
11571606 if (sign(a.rep[i]) >= 0) {
11611610 }
11621611 }
11631612
1164
1165 /* 2m-point FFT's mod p */
1166 fft(aa, r, l + 1, p, mr);
1167
1168 /* Pointwise multiplication aa := aa * aa mod p */
1613 long yn = SS_FFTRoundUp(n+1, l+1);
1614
1615 /* N-point FFT's mod p */
1616 fft_trunc(aa, yn, SS_FFTRoundUp(na+1, l+1), r, l+1, l1+1, p, mr);
1617
1618
1619 /* Pointwise multiplication aa := aa * aa mod p */
11691620 // NOTE: we attempt to parallelize this
11701621 // Unfortunately, the bulk of the time is spent
11711622 // in the FFT, so this is not very effective
1172 NTL_EXEC_RANGE(m2, first, last)
1623 NTL_EXEC_RANGE(yn, first, last)
11731624 ZZ tmp, ai;
11741625 for (long i = first; i < last; i++) {
11751626 sqr(ai, aa[i]);
11841635 aa[i] = ai;
11851636 }
11861637 NTL_EXEC_RANGE_END
1187
1188 ifft(aa, r, l + 1, p, mr);
1189
1190
1191 /* Retrieve c, dividing by 2m, and subtracting p where necessary */
1638
1639 ifft_trunc(aa, yn, r, l+1, l1+1, p, mr);
1640
1641 /* Retrieve c, dividing by N, and subtracting p where necessary */
11921642 c.rep.SetLength(n + 1);
11931643 ZZ ai, tmp, scratch;
11941644 for (long i = 0; i <= n; i++) {
11951645 ai = aa[i];
11961646 ZZ& ci = c.rep[i];
11971647 if (!IsZero(ai)) {
1198 /* ci = -ai * 2^{mr-l-1} = ai * 2^{-l-1} = ai / 2m mod p */
1648 /* ci = -ai * 2^{mr-l-1} = ai * 2^{-l-1} = ai / N mod p */
11991649 LeftRotate(ai, ai, mr - l - 1, p, mr, scratch);
12001650 sub(tmp, p, ai);
12011651 if (NumBits(tmp) >= mr) { /* ci >= (p-1)/2 */
12081658 clear(ci);
12091659 }
12101660 }
1661
1662
12111663
12121664 void sqr(ZZX& c, const ZZX& a)
12131665 {
184184 ptr = ZZ_pInfo_stg;
185185 }
186186
187
188187 void ZZ_pContext::restore() const
189188 {
190189 if (ZZ_pInfo == ptr.get()) return;
18871887 }
18881888 }
18891889
1890 void GCD(ZZ_pEX& x, const ZZ_pEX& a, const ZZ_pEX& b)
1890 void PlainGCD(ZZ_pEX& x, const ZZ_pEX& a, const ZZ_pEX& b)
18911891 {
18921892 ZZ_pE t;
18931893
19221922 mul(x, x, t);
19231923 }
19241924
1925
1926
1927
1925 class _NTL_ZZ_pEXMatrix {
1926 private:
1927
1928 _NTL_ZZ_pEXMatrix(const _NTL_ZZ_pEXMatrix&); // disable
1929 ZZ_pEX elts[2][2];
1930
1931 public:
1932
1933 _NTL_ZZ_pEXMatrix() { }
1934 ~_NTL_ZZ_pEXMatrix() { }
1935
1936 void operator=(const _NTL_ZZ_pEXMatrix&);
1937 ZZ_pEX& operator() (long i, long j) { return elts[i][j]; }
1938 const ZZ_pEX& operator() (long i, long j) const { return elts[i][j]; }
1939 };
1940
1941
1942 void _NTL_ZZ_pEXMatrix::operator=(const _NTL_ZZ_pEXMatrix& M)
1943 {
1944 elts[0][0] = M.elts[0][0];
1945 elts[0][1] = M.elts[0][1];
1946 elts[1][0] = M.elts[1][0];
1947 elts[1][1] = M.elts[1][1];
1948 }
1949
1950
1951 static
1952 void mul(ZZ_pEX& U, ZZ_pEX& V, const _NTL_ZZ_pEXMatrix& M)
1953 // (U, V)^T = M*(U, V)^T
1954 {
1955 ZZ_pEX t1, t2, t3;
1956
1957 mul(t1, M(0,0), U);
1958 mul(t2, M(0,1), V);
1959 add(t3, t1, t2);
1960 mul(t1, M(1,0), U);
1961 mul(t2, M(1,1), V);
1962 add(V, t1, t2);
1963 U = t3;
1964 }
1965
1966
1967 static
1968 void mul(_NTL_ZZ_pEXMatrix& A, _NTL_ZZ_pEXMatrix& B, _NTL_ZZ_pEXMatrix& C)
1969 // A = B*C, B and C are destroyed
1970 {
1971 ZZ_pEX t1, t2;
1972
1973 mul(t1, B(0,0), C(0,0));
1974 mul(t2, B(0,1), C(1,0));
1975 add(A(0,0), t1, t2);
1976
1977 mul(t1, B(1,0), C(0,0));
1978 mul(t2, B(1,1), C(1,0));
1979 add(A(1,0), t1, t2);
1980
1981 mul(t1, B(0,0), C(0,1));
1982 mul(t2, B(0,1), C(1,1));
1983 add(A(0,1), t1, t2);
1984
1985 mul(t1, B(1,0), C(0,1));
1986 mul(t2, B(1,1), C(1,1));
1987 add(A(1,1), t1, t2);
1988
1989 long i, j;
1990 for (i = 0; i < 2; i++) {
1991 for (j = 0; j < 2; j++) {
1992 B(i,j).kill();
1993 C(i,j).kill();
1994 }
1995 }
1996 }
1997
1998
1999 void IterHalfGCD(_NTL_ZZ_pEXMatrix& M_out, ZZ_pEX& U, ZZ_pEX& V, long d_red)
2000 {
2001 M_out(0,0).SetMaxLength(d_red);
2002 M_out(0,1).SetMaxLength(d_red);
2003 M_out(1,0).SetMaxLength(d_red);
2004 M_out(1,1).SetMaxLength(d_red);
2005
2006 set(M_out(0,0)); clear(M_out(0,1));
2007 clear(M_out(1,0)); set(M_out(1,1));
2008
2009 long goal = deg(U) - d_red;
2010
2011 if (deg(V) <= goal)
2012 return;
2013
2014 ZZ_pEX Q, t(INIT_SIZE, d_red);
2015
2016 while (deg(V) > goal) {
2017 PlainDivRem(Q, U, U, V);
2018 swap(U, V);
2019
2020 mul(t, Q, M_out(1,0));
2021 sub(t, M_out(0,0), t);
2022 M_out(0,0) = M_out(1,0);
2023 M_out(1,0) = t;
2024
2025 mul(t, Q, M_out(1,1));
2026 sub(t, M_out(0,1), t);
2027 M_out(0,1) = M_out(1,1);
2028 M_out(1,1) = t;
2029 }
2030 }
2031
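[editor's note] In outline (using the convention of the 2x2 mul above): each division step replaces (U, V) by (V, U - Q*V), i.e. multiplies the column (U, V)^T on the left by ( 0 1 ; 1 -Q ), and the same matrix is folded into M_out; since M_out starts as the identity, on return (U, V)^T equals M_out times the original (U, V)^T. This is the invariant that HalfGCD and XHalfGCD below rely on when they apply and compose M1 and M2.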
2032
2033
2034 #define NTL_ZZ_pEX_HalfGCD_CROSSOVER (25)
2035 #define NTL_ZZ_pEX_GCD_CROSSOVER (275)
2036
2037
2038 void HalfGCD(_NTL_ZZ_pEXMatrix& M_out, const ZZ_pEX& U, const ZZ_pEX& V, long d_red)
2039 {
2040 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2041 set(M_out(0,0)); clear(M_out(0,1));
2042 clear(M_out(1,0)); set(M_out(1,1));
2043
2044 return;
2045 }
2046
2047
2048 long n = deg(U) - 2*d_red + 2;
2049 if (n < 0) n = 0;
2050
2051 ZZ_pEX U1, V1;
2052
2053 RightShift(U1, U, n);
2054 RightShift(V1, V, n);
2055
2056 if (d_red <= NTL_ZZ_pEX_HalfGCD_CROSSOVER) {
2057 IterHalfGCD(M_out, U1, V1, d_red);
2058 return;
2059 }
2060
2061 long d1 = (d_red + 1)/2;
2062 if (d1 < 1) d1 = 1;
2063 if (d1 >= d_red) d1 = d_red - 1;
2064
2065 _NTL_ZZ_pEXMatrix M1;
2066
2067 HalfGCD(M1, U1, V1, d1);
2068 mul(U1, V1, M1);
2069
2070 long d2 = deg(V1) - deg(U) + n + d_red;
2071
2072 if (IsZero(V1) || d2 <= 0) {
2073 M_out = M1;
2074 return;
2075 }
2076
2077
2078 ZZ_pEX Q;
2079 _NTL_ZZ_pEXMatrix M2;
2080
2081 DivRem(Q, U1, U1, V1);
2082 swap(U1, V1);
2083
2084 HalfGCD(M2, U1, V1, d2);
2085
2086 ZZ_pEX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
2087
2088 mul(t, Q, M1(1,0));
2089 sub(t, M1(0,0), t);
2090 swap(M1(0,0), M1(1,0));
2091 swap(M1(1,0), t);
2092
2093 t.kill();
2094
2095 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
2096
2097 mul(t, Q, M1(1,1));
2098 sub(t, M1(0,1), t);
2099 swap(M1(0,1), M1(1,1));
2100 swap(M1(1,1), t);
2101
2102 t.kill();
2103
2104 mul(M_out, M2, M1);
2105 }
2106
2107
2108
2109
2110 void XHalfGCD(_NTL_ZZ_pEXMatrix& M_out, ZZ_pEX& U, ZZ_pEX& V, long d_red)
2111 {
2112 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2113 set(M_out(0,0)); clear(M_out(0,1));
2114 clear(M_out(1,0)); set(M_out(1,1));
2115
2116 return;
2117 }
2118
2119 long du = deg(U);
2120
2121 if (d_red <= NTL_ZZ_pEX_HalfGCD_CROSSOVER) {
2122 IterHalfGCD(M_out, U, V, d_red);
2123 return;
2124 }
2125
2126 long d1 = (d_red + 1)/2;
2127 if (d1 < 1) d1 = 1;
2128 if (d1 >= d_red) d1 = d_red - 1;
2129
2130 //ZZ_pXMatrix M1;
2131 _NTL_ZZ_pEXMatrix M1;
2132
2133 HalfGCD(M1, U, V, d1);
2134 mul(U, V, M1);
2135
2136 long d2 = deg(V) - du + d_red;
2137
2138 if (IsZero(V) || d2 <= 0) {
2139 M_out = M1;
2140 return;
2141 }
2142
2143
2144 ZZ_pEX Q;
2145 _NTL_ZZ_pEXMatrix M2;
2146
2147 DivRem(Q, U, U, V);
2148 swap(U, V);
2149
2150 XHalfGCD(M2, U, V, d2);
2151
2152 ZZ_pEX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
2153
2154 mul(t, Q, M1(1,0));
2155 sub(t, M1(0,0), t);
2156 swap(M1(0,0), M1(1,0));
2157 swap(M1(1,0), t);
2158
2159 t.kill();
2160
2161 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
2162
2163 mul(t, Q, M1(1,1));
2164 sub(t, M1(0,1), t);
2165 swap(M1(0,1), M1(1,1));
2166 swap(M1(1,1), t);
2167
2168 t.kill();
2169
2170 mul(M_out, M2, M1);
2171 }
2172
2173 void HalfGCD(ZZ_pEX& U, ZZ_pEX& V)
2174 {
2175 long d_red = (deg(U)+1)/2;
2176
2177 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2178 return;
2179 }
2180
2181 long du = deg(U);
2182
2183
2184 long d1 = (d_red + 1)/2;
2185 if (d1 < 1) d1 = 1;
2186 if (d1 >= d_red) d1 = d_red - 1;
2187
2188 _NTL_ZZ_pEXMatrix M1;
2189
2190 HalfGCD(M1, U, V, d1);
2191 mul(U, V, M1);
2192
2193 long d2 = deg(V) - du + d_red;
2194
2195 if (IsZero(V) || d2 <= 0) {
2196 return;
2197 }
2198
2199 M1(0,0).kill();
2200 M1(0,1).kill();
2201 M1(1,0).kill();
2202 M1(1,1).kill();
2203
2204
2205 ZZ_pEX Q;
2206
2207 DivRem(Q, U, U, V);
2208 swap(U, V);
2209
2210 HalfGCD(M1, U, V, d2);
2211
2212 mul(U, V, M1);
2213 }
2214
2215
2216 void GCD(ZZ_pEX& d, const ZZ_pEX& u, const ZZ_pEX& v)
2217 {
2218 ZZ_pEX u1, v1;
2219
2220 u1 = u;
2221 v1 = v;
2222
2223 if (deg(u1) == deg(v1)) {
2224 if (IsZero(u1)) {
2225 clear(d);
2226 return;
2227 }
2228
2229 rem(v1, v1, u1);
2230 }
2231 else if (deg(u1) < deg(v1)) {
2232 swap(u1, v1);
2233 }
2234
2235 // deg(u1) > deg(v1)
2236
2237 while (deg(u1) > NTL_ZZ_pEX_GCD_CROSSOVER && !IsZero(v1)) {
2238 HalfGCD(u1, v1);
2239
2240 if (!IsZero(v1)) {
2241 rem(u1, u1, v1);
2242 swap(u1, v1);
2243 }
2244 }
2245
2246 PlainGCD(d, u1, v1);
2247 }
2248
19282249
19292250 void XGCD(ZZ_pEX& d, ZZ_pEX& s, ZZ_pEX& t, const ZZ_pEX& a, const ZZ_pEX& b)
19302251 {
1931 ZZ_pE z;
1932
1933
1934 if (IsZero(b)) {
2252 ZZ_pE w;
2253
2254 if (IsZero(a) && IsZero(b)) {
2255 clear(d);
19352256 set(s);
19362257 clear(t);
1937 d = a;
1938 }
1939 else if (IsZero(a)) {
1940 clear(s);
1941 set(t);
1942 d = b;
1943 }
1944 else {
1945 long e = max(deg(a), deg(b)) + 1;
1946
1947 ZZ_pEX temp(INIT_SIZE, e), u(INIT_SIZE, e), v(INIT_SIZE, e),
1948 u0(INIT_SIZE, e), v0(INIT_SIZE, e),
1949 u1(INIT_SIZE, e), v1(INIT_SIZE, e),
1950 u2(INIT_SIZE, e), v2(INIT_SIZE, e), q(INIT_SIZE, e);
1951
1952
1953 set(u1); clear(v1);
1954 clear(u2); set(v2);
1955 u = a; v = b;
1956
1957 do {
1958 DivRem(q, u, u, v);
1959 swap(u, v);
1960 u0 = u2;
1961 v0 = v2;
1962 mul(temp, q, u2);
1963 sub(u2, u1, temp);
1964 mul(temp, q, v2);
1965 sub(v2, v1, temp);
1966 u1 = u0;
1967 v1 = v0;
1968 } while (!IsZero(v));
1969
1970 d = u;
1971 s = u1;
1972 t = v1;
1973 }
1974
1975 if (IsZero(d)) return;
1976 if (IsOne(LeadCoeff(d))) return;
1977
1978 /* make gcd monic */
1979
1980 inv(z, LeadCoeff(d));
1981 mul(d, d, z);
1982 mul(s, s, z);
1983 mul(t, t, z);
1984 }
2258 return;
2259 }
2260
2261 ZZ_pEX U, V, Q;
2262
2263 U = a;
2264 V = b;
2265
2266 long flag = 0;
2267
2268 if (deg(U) == deg(V)) {
2269 DivRem(Q, U, U, V);
2270 swap(U, V);
2271 flag = 1;
2272 }
2273 else if (deg(U) < deg(V)) {
2274 swap(U, V);
2275 flag = 2;
2276 }
2277
2278 _NTL_ZZ_pEXMatrix M;
2279
2280 XHalfGCD(M, U, V, deg(U)+1);
2281
2282 d = U;
2283
2284 if (flag == 0) {
2285 s = M(0,0);
2286 t = M(0,1);
2287 }
2288 else if (flag == 1) {
2289 s = M(0,1);
2290 mul(t, Q, M(0,1));
2291 sub(t, M(0,0), t);
2292 }
2293 else { /* flag == 2 */
2294 s = M(0,1);
2295 t = M(0,0);
2296 }
2297
2298 // normalize
2299
2300 inv(w, LeadCoeff(d));
2301 mul(d, d, w);
2302 mul(s, s, w);
2303 mul(t, t, w);
2304 }
2305
19852306
19862307 void IterBuild(ZZ_pE* a, long n)
19872308 {
0 #include <NTL/ZZ_pXFactoring.h>
1 #include <NTL/ZZ_pEX.h>
2
3 NTL_CLIENT
4
5
6
7 void test(ZZ_pX& P, ZZ_pEX& f, ZZ_pEX& g, ZZ_pEX& h, ZZ_pEX& hx, ZZ_pEX& s, ZZ_pEX& t)
8 {
9 /* P is the polynomial of the extension
10 * f and g the polynomials
11 * h the gcd
12 * hx the gcd obtained using XGCD
13 * s, t are Bezout coefficients hx=f*s+g*t
14 */
15 ZZ_pEX htest,rf,rg;
16
17 if (h!=hx){
18 cout << P << "\n" << f << "\n" << g << "\n";
19 Error("different gcd:\n");
20 }
21
22 if (max(deg(f), deg(g)) > 0 || min(deg(f), deg(g)) >= 0) {
23 if (deg(s) >= deg(g) || deg(t) >= deg(f)) {
24 cout << P << "\n" << f << "\n" << g << "\n";
25 Error("degree bounds at fault:\n");
26 }
27 }
28
29
30 mul(s,s,f);
31 mul(t,t,g);
32 add(htest,t,s);
33 if (h!=htest){
34 cout << P << "\n" << f << "\n" << g << "\n";
35 Error("xgcd at fault:\n");
36 }
37 if (!IsZero(h)){
38 rem(rf,f,h);
39 rem(rg,g,h);
40 if ((!IsZero(rf))||(!IsZero(rg))){
41 cout << P << "\n" << f << "\n" << g << "\n";
42 Error("not a common divisor\n");
43 }
44 }else{
45 if (!IsZero(f) && !IsZero(g)){
46 cout << "debug:\n";
47 cout << P << "\n" << f << "\n" << g << "\n" << h << "\n";
48 Error("ooops:\n");
49 }
50 }
51 }
52
53
54 int main()
55 {
56
57 ZZ prime = conv<ZZ>("340282366920938463463374607619092730237");
58
59 ZZ_p::init(prime);
60
61 ZZ_pX P;
62
63 BuildIrred(P, 3);
64
65 ZZ_pE::init(P);
66
67 for (long i = 0; i < 400; i++) {
68 if (i%10 == 0) cerr << ".";
69 ZZ_pEX f,g,h,s,t,hx;
70
71 long deg_h;
72 if (RandomBnd(2))
73 deg_h = RandomBnd(10)+1;
74 else
75 deg_h = RandomBnd(500)+1;
76
77 random(h, deg_h);
78 SetCoeff(h, deg_h);
79
80 long deg_f;
81 if (RandomBnd(2))
82 deg_f = RandomBnd(10)+1;
83 else
84 deg_f = RandomBnd(1000)+1;
85
86 random(f, deg_f);
87 f *= h;
88
89 long deg_g;
90 if (RandomBnd(2))
91 deg_g = RandomBnd(10)+1;
92 else
93 deg_g = RandomBnd(1000)+1;
94
95 random(g, deg_g);
96 g *= h;
97
98 h = 0;
99
100 GCD(h, f, g);
101 XGCD(hx, s, t, f, g);
102 test(P, f, g, h, hx, s, t);
103 }
104
105 cerr << "\n";
106
107 }
00 #include <NTL/ZZ_pX.h>
11 #include <NTL/BasicThreadPool.h>
2 #include <NTL/FFT_impl.h>
23
34
45 // The mul & sqr routines use routines from ZZX,
1213
1314 #endif
1415
16 NTL_START_IMPL
1517
1618
1719 #if (defined(NTL_GMP_LIP))
2022 #define KARX 80
2123 #endif
2224
23
24
25 NTL_START_IMPL
26
25 #define PAR_THRESH (4000.0)
26
27 #define PAR_THRESH1 (20000.0)
28 // Higher threshold for cheaper operations
29
30 static inline bool BelowThresh(long n)
31 {
32 return double(n)*double(ZZ_p::ModulusSize()) < PAR_THRESH;
33 }
34
35 static inline bool BelowThresh1(long n)
36 {
37 return double(n)*double(ZZ_p::ModulusSize()) < PAR_THRESH1;
38 }
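[editor's note] A rough reading of these cutoffs (assuming ZZ_p::ModulusSize() reports the modulus size in machine words): with a modulus of about 4 words, helper threads are engaged only once roughly 4000/4 = 1000 coefficients are in play, while the cheaper FFTRep add/sub/mul loops guarded by BelowThresh1 wait until about 20000/4 = 5000 points.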
2739
2840
2941
475487
476488 if ( nt == 1 && (
477489
478 (k >= 106 && rat < 1.30) ||
490 (k >= 106 && rat < 1.50) ||
479491 (k >= 212 && rat < 1.75)
480492
481493 )) {
532544
533545 if ( nt == 1 && (
534546
535 (k >= 53 && rat < 1.10) ||
547 (k >= 53 && rat < 1.20) ||
536548 (k >= 106 && rat < 1.30) ||
537549 (k >= 212 && rat < 1.75)
538550
10191031 {
10201032 BasicThreadPool *pool = GetThreadPool();
10211033
1022 if (!pool || pool->active() || pool->NumThreads() == 1) {
1034 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(n)) {
10231035 basic_MulAux(xp, ap, t, n);
10241036 return;
10251037 }
12381250
12391251 BasicThreadPool *pool = GetThreadPool();
12401252
1241 if (!pool || pool->active() || pool->NumThreads() == 1 || hh == aa) {
1253 if (!pool || pool->active() || pool->NumThreads() == 1 || hh == aa || BelowThresh(n)) {
12421254 // Careful! can't parallelize if hh == aa
12431255 basic_MulByXModAux1(n, hh, aa, ff, z);
12441256 return;
13821394
13831395 if (R.k < 0) {
13841396 k = -1;
1397 len = 0;
13851398 return *this;
13861399 }
13871400
13881401 DoSetSize(R.k, R.NumPrimes);
1389 long i, j, n;
1390
1391 n = 1L << k;
1402 len = R.len;
1403
1404 long i, j;
13921405
13931406 for (i = 0; i < NumPrimes; i++)
1394 for (j = 0; j < n; j++)
1407 for (j = 0; j < len; j++)
13951408 tbl[i][j] = R.tbl[i][j];
13961409
13971410 return *this;
14821495
14831496
14841497
1485 NTL_TBDECL(ToFFTRep)(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi)
1498 NTL_TBDECL(ToFFTRep_trunc)(FFTRep& y, const ZZ_pX& x, long k, long len, long lo, long hi)
14861499 // computes an n = 2^k point convolution.
14871500 // if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
14881501 {
15061519 hi = min(hi, deg(x));
15071520
15081521 y.SetSize(k);
1509
15101522 n = 1L << k;
15111523
1524 y.len = len = FFTRoundUp(len, k);
1525
15121526 m = max(hi-lo + 1, 0);
1527 long ilen = FFTRoundUp(m, k);
15131528
15141529 const ZZ_p *xx = x.rep.elts();
15151530
15211536 }
15221537 }
15231538
1524 if (n > m) {
1539 if (ilen > m) {
15251540 for (i = 0; i < nprimes; i++) {
15261541 long *yp = &y.tbl[i][0];
1527 for (j = m; j < n; j++) {
1542 for (j = m; j < ilen; j++) {
15281543 yp[j] = 0;
15291544 }
15301545 }
15491564
15501565 for (i = 0; i < nprimes; i++) {
15511566 long *yp = &y.tbl[i][0];
1552 FFTFwd(yp, yp, k, i);
1567 FFTFwd_trunc(yp, yp, k, i, len, ilen);
15531568 }
15541569 }
15551570
15561571
15571572 #ifdef NTL_THREAD_BOOST
15581573
1559 void ToFFTRep(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi)
1574 void ToFFTRep_trunc(FFTRep& y, const ZZ_pX& x, long k, long len, long lo, long hi)
15601575 // computes an n = 2^k point convolution.
15611576 // if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
15621577 {
15631578 BasicThreadPool *pool = GetThreadPool();
15641579
1565 if (!pool || pool->active() || pool->NumThreads() == 1) {
1566 basic_ToFFTRep(y, x, k, lo, hi);
1580 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << k)) {
1581 basic_ToFFTRep_trunc(y, x, k, len, lo, hi);
15671582 return;
15681583 }
15691584
15851600 hi = min(hi, deg(x));
15861601
15871602 y.SetSize(k);
1588
15891603 n = 1L << k;
15901604
1605 y.len = len = FFTRoundUp(len, k);
1606
15911607 m = max(hi-lo + 1, 0);
1608 long ilen = FFTRoundUp(m, k);
15921609
15931610 const ZZ_p *xx = x.rep.elts();
15941611
16451662 // cache performance. I don't really know if that is an issue.
16461663
16471664 pool->exec_range(nprimes,
1648 [&y, m, n, k](long first, long last) {
1665 [&y, m, n, k, len, ilen](long first, long last) {
16491666 for (long i = first; i < last; i++) {
16501667 long *yp = &y.tbl[i][0];
1651 for (long j = m; j < n; j++) yp[j] = 0;
1652 FFTFwd(yp, yp, k, i);
1668 for (long j = m; j < ilen; j++) yp[j] = 0;
1669 FFTFwd_trunc(yp, yp, k, i, len, ilen);
16531670 }
16541671 } );
16551672 }
16861703 y.SetSize(k);
16871704
16881705 n = 1L << k;
1706 y.len = n;
16891707
16901708 m = max(hi-lo + 1, 0);
16911709
17151733
17161734 for (i = 0; i < nprimes; i++) {
17171735 long *yp = &y.tbl[i][0];
1718 FFTRev1(yp, yp, k, i);
1736 FFTRev1_trans(yp, yp, k, i);
17191737 }
17201738
17211739 }
17321750 {
17331751 BasicThreadPool *pool = GetThreadPool();
17341752
1735 if (!pool || pool->active() || pool->NumThreads() == 1) {
1753 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << k)) {
17361754 basic_RevToFFTRep(y, x, k, lo, hi, offset);
17371755 return;
17381756 }
17541772 y.SetSize(k);
17551773
17561774 n = 1L << k;
1775 y.len = n;
17571776
17581777 m = max(hi-lo + 1, 0);
17591778
18031822 [&y, k](long first, long last) {
18041823 for (long i = first; i < last; i++) {
18051824 long *yp = &y.tbl[i][0];
1806 FFTRev1(yp, yp, k, i);
1825 FFTRev1_trans(yp, yp, k, i);
18071826 }
18081827 } );
18091828
18371856 k = y.k;
18381857 n = (1L << k);
18391858
1840
1841 for (i = 0; i < nprimes; i++) {
1842 long *yp = &y.tbl[i][0];
1843 FFTRev1(yp, yp, k, i);
1844 }
1845
18461859 hi = min(hi, n-1);
18471860 l = hi-lo+1;
18481861 l = max(l, 0);
1862
1863 long len = y.len;
1864 if (len <= hi) LogicError("FromFFTRep: bad len");
1865
1866
1867 for (i = 0; i < nprimes; i++) {
1868 long *yp = &y.tbl[i][0];
1869 FFTRev1_trunc(yp, yp, k, i, len);
1870 }
1871
18491872 x.rep.SetLength(l);
1873
18501874
18511875 for (j = 0; j < l; j++) {
18521876 for (i = 0; i < nprimes; i++)
18691893 {
18701894 BasicThreadPool *pool = GetThreadPool();
18711895
1872 if (!pool || pool->active() || pool->NumThreads() == 1) {
1896 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << y.k)) {
18731897 basic_FromFFTRep(x, y, lo, hi);
18741898 return;
18751899 }
18811905
18821906 k = y.k;
18831907 n = (1L << k);
1884
1885
1886 pool->exec_range(nprimes,
1887 [&y, k](long first, long last) {
1888 for (long i = first; i < last; i++) {
1889 long *yp = &y.tbl[i][0];
1890 FFTRev1(yp, yp, k, i);
1891 }
1892 } );
18931908
18941909 hi = min(hi, n-1);
18951910 l = hi-lo+1;
18961911 l = max(l, 0);
1912
1913 long len = y.len;
1914 if (len <= hi) LogicError("FromFFTRep: bad len");
1915
1916 pool->exec_range(nprimes,
1917 [&y, k, len](long first, long last) {
1918 for (long i = first; i < last; i++) {
1919 long *yp = &y.tbl[i][0];
1920 FFTRev1_trunc(yp, yp, k, i, len);
1921 }
1922 } );
1923
1924
18971925 x.rep.SetLength(l);
18981926 ZZ_p *xx = x.rep.elts();
18991927
19501978 k = y.k;
19511979 n = (1L << k);
19521980
1981 if (y.len != n) LogicError("RevFromFFTRep: bad len");
1982
1983
19531984 long nprimes = FFTInfo->NumPrimes;
19541985 t.SetLength(nprimes);
19551986
19561987 for (i = 0; i < nprimes; i++) {
19571988 long *yp = &y.tbl[i][0];
1958 FFTFwd(yp, yp, k, i);
1989 FFTFwd_trans(yp, yp, k, i);
19591990 }
19601991
19611992 hi = min(hi, n-1);
19782009 {
19792010 BasicThreadPool *pool = GetThreadPool();
19802011
1981 if (!pool || pool->active() || pool->NumThreads() == 1) {
2012 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << y.k)) {
19822013 basic_RevFromFFTRep(x, y, lo, hi);
19832014 return;
19842015 }
19902021
19912022 k = y.k;
19922023 n = (1L << k);
2024
2025 if (y.len != n) LogicError("RevFromFFTRep: bad len");
19932026
19942027
19952028 pool->exec_range(nprimes,
19962029 [&y, k](long first, long last) {
19972030 for (long i = first; i < last; i++) {
19982031 long *yp = &y.tbl[i][0];
1999 FFTFwd(yp, yp, k, i);
2032 FFTFwd_trans(yp, yp, k, i);
20002033 }
20012034 } );
20022035
20562089 k = y.k;
20572090 n = (1L << k);
20582091
2092 hi = min(hi, n-1);
2093 l = hi-lo+1;
2094 l = max(l, 0);
2095
2096 long len = y.len;
2097 if (len <= hi) LogicError("FromFFTRep: bad len");
2098
20592099 z.SetSize(k);
20602100
20612101 for (i = 0; i < nprimes; i++) {
20622102 long *zp = &z.tbl[i][0];
20632103 const long *yp = &y.tbl[i][0];
20642104
2065 FFTRev1(zp, yp, k, i);
2066 }
2105 FFTRev1_trunc(zp, yp, k, i, len);
2106 }
2107
2108 x.rep.SetLength(l);
2109
2110 for (j = 0; j < l; j++) {
2111 for (i = 0; i < nprimes; i++)
2112 t[i] = z.tbl[i][j+lo];
2113
2114 FromModularRep(x.rep[j], t, FFTInfo, TmpSpace);
2115 }
2116
2117 x.normalize();
2118 }
2119
2120 #ifdef NTL_THREAD_BOOST
2121
2122 void NDFromFFTRep(ZZ_pX& x, const FFTRep& y, long lo, long hi, FFTRep& z)
2123
2124 // converts from FFT-representation to coefficient representation
2125 // only the coefficients lo..hi are computed
2126
2127
2128 {
2129 BasicThreadPool *pool = GetThreadPool();
2130
2131 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << y.k)) {
2132 basic_NDFromFFTRep(x, y, lo, hi, z);
2133 return;
2134 }
2135 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2136
2137 long k, n, l;
2138
2139 long nprimes = FFTInfo->NumPrimes;
2140
2141 k = y.k;
2142 n = (1L << k);
20672143
20682144 hi = min(hi, n-1);
20692145 l = hi-lo+1;
20702146 l = max(l, 0);
2071 x.rep.SetLength(l);
2072
2073 for (j = 0; j < l; j++) {
2074 for (i = 0; i < nprimes; i++)
2075 t[i] = z.tbl[i][j+lo];
2076
2077 FromModularRep(x.rep[j], t, FFTInfo, TmpSpace);
2078 }
2079
2080 x.normalize();
2081 }
2082
2083 #ifdef NTL_THREAD_BOOST
2084
2085 void NDFromFFTRep(ZZ_pX& x, const FFTRep& y, long lo, long hi, FFTRep& z)
2086
2087 // converts from FFT-representation to coefficient representation
2088 // only the coefficients lo..hi are computed
2089
2090
2091 {
2092 BasicThreadPool *pool = GetThreadPool();
2093
2094 if (!pool || pool->active() || pool->NumThreads() == 1) {
2095 basic_NDFromFFTRep(x, y, lo, hi, z);
2096 return;
2097 }
2098 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2099
2100 long k, n, l;
2101
2102 long nprimes = FFTInfo->NumPrimes;
2103
2104 k = y.k;
2105 n = (1L << k);
2147
2148 long len = y.len;
2149 if (len <= hi) LogicError("FromFFTRep: bad len");
21062150
21072151 z.SetSize(k);
21082152
21092153 pool->exec_range(nprimes,
2110 [&y, &z, k](long first, long last) {
2154 [&y, &z, k, len](long first, long last) {
21112155 for (long i = first; i < last; i++) {
21122156 long *zp = &z.tbl[i][0];
21132157 const long *yp = &y.tbl[i][0];
2114 FFTRev1(zp, yp, k, i);
2158 FFTRev1_trunc(zp, yp, k, i, len);
21152159 }
21162160 } );
21172161
2118 hi = min(hi, n-1);
2119 l = hi-lo+1;
2120 l = max(l, 0);
21212162 x.rep.SetLength(l);
21222163 ZZ_p *xx = x.rep.elts();
21232164
21562197 NDFromFFTRep(x, y, lo, hi, z);
21572198 }
21582199
2200
2201
21592202 NTL_TBDECL(FromFFTRep)(ZZ_p* x, FFTRep& y, long lo, long hi)
21602203
21612204 // converts from FFT-representation to coefficient representation
21732216
21742217 k = y.k;
21752218 n = (1L << k);
2219
2220 //if (y.len <= min(hi, n-1)) LogicError("FromFFTRep: bad len");
2221 if (y.len != n) LogicError("FromFFTRep: bad len");
21762222
21772223 long nprimes = FFTInfo->NumPrimes;
21782224 t.SetLength(nprimes);
22062252 {
22072253 BasicThreadPool *pool = GetThreadPool();
22082254
2209 if (!pool || pool->active() || pool->NumThreads() == 1) {
2255 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << y.k)) {
22102256 basic_FromFFTRep(x, y, lo, hi);
22112257 return;
22122258 }
22182264
22192265 k = y.k;
22202266 n = (1L << k);
2267
2268 //if (y.len <= min(hi, n-1)) LogicError("FromFFTRep: bad len");
2269 if (y.len != n) LogicError("FromFFTRep: bad len");
22212270
22222271 long nprimes = FFTInfo->NumPrimes;
22232272
22632312 #endif
22642313
22652314
2315
22662316 NTL_TBDECL(mul)(FFTRep& z, const FFTRep& x, const FFTRep& y)
22672317 {
22682318 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
22692319
2270 long k, n, i, j;
2320 long k, i, j;
22712321
22722322 if (x.k != y.k) LogicError("FFT rep mismatch");
22732323
22742324 k = x.k;
2275 n = 1L << k;
22762325
22772326 z.SetSize(k);
2327
2328 long len = z.len = min(x.len, y.len);
22782329
22792330 long nprimes = FFTInfo->NumPrimes;
22802331
22852336 long q = GetFFTPrime(i);
22862337 mulmod_t qinv = GetFFTPrimeInv(i);
22872338
2288 for (j = 0; j < n; j++)
2339 for (j = 0; j < len; j++)
22892340 zp[j] = NormalizedMulMod(xp[j], yp[j], q, qinv);
22902341 }
22912342
22982349 {
22992350 BasicThreadPool *pool = GetThreadPool();
23002351
2301 if (!pool || pool->active() || pool->NumThreads() == 1) {
2352 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh1(1L << x.k)) {
23022353 basic_mul(z, x, y);
23032354 return;
23042355 }
23052356 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
23062357
2307 long k, n;
2358 long k;
23082359
23092360 if (x.k != y.k) LogicError("FFT rep mismatch");
23102361
23112362 k = x.k;
2312 n = 1L << k;
23132363
23142364 z.SetSize(k);
23152365
2366 long len = z.len = min(x.len, y.len);
2367
23162368 long nprimes = FFTInfo->NumPrimes;
23172369
23182370 pool->exec_range(nprimes,
2319 [&x, &y, &z, n](long first, long last) {
2371 [&x, &y, &z, len](long first, long last) {
23202372 for (long i = first; i < last; i++) {
23212373 long *zp = &z.tbl[i][0];
23222374 const long *xp = &x.tbl[i][0];
23242376 long q = GetFFTPrime(i);
23252377 mulmod_t qinv = GetFFTPrimeInv(i);
23262378
2327 for (long j = 0; j < n; j++)
2379 for (long j = 0; j < len; j++)
23282380 zp[j] = NormalizedMulMod(xp[j], yp[j], q, qinv);
23292381 }
23302382 } );
23392391 {
23402392 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
23412393
2342 long k, n, i, j;
2394 long k, i, j;
23432395
23442396 if (x.k != y.k) LogicError("FFT rep mismatch");
23452397
23462398 k = x.k;
2347 n = 1L << k;
23482399
23492400 z.SetSize(k);
2401
2402 long len = z.len = min(x.len, y.len);
23502403
23512404 long nprimes = FFTInfo->NumPrimes;
23522405
23562409 const long *yp = &y.tbl[i][0];
23572410 long q = GetFFTPrime(i);
23582411
2359 for (j = 0; j < n; j++)
2412 for (j = 0; j < len; j++)
23602413 zp[j] = SubMod(xp[j], yp[j], q);
23612414 }
23622415
23692422 {
23702423 BasicThreadPool *pool = GetThreadPool();
23712424
2372 if (!pool || pool->active() || pool->NumThreads() == 1) {
2425 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh1(1L << x.k)) {
23732426 basic_sub(z, x, y);
23742427 return;
23752428 }
23762429 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
23772430
2378 long k, n;
2431 long k;
23792432
23802433 if (x.k != y.k) LogicError("FFT rep mismatch");
23812434
23822435 k = x.k;
2383 n = 1L << k;
23842436
23852437 z.SetSize(k);
23862438
2439 long len = z.len = min(x.len, y.len);
2440
23872441 long nprimes = FFTInfo->NumPrimes;
23882442
23892443 pool->exec_range(nprimes,
2390 [&x, &y, &z, n](long first, long last) {
2444 [&x, &y, &z, len](long first, long last) {
23912445 for (long i = first; i < last; i++) {
23922446 long *zp = &z.tbl[i][0];
23932447 const long *xp = &x.tbl[i][0];
23942448 const long *yp = &y.tbl[i][0];
23952449 long q = GetFFTPrime(i);
23962450
2397 for (long j = 0; j < n; j++)
2451 for (long j = 0; j < len; j++)
23982452 zp[j] = SubMod(xp[j], yp[j], q);
23992453 }
24002454 } );
24092463 {
24102464 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
24112465
2412 long k, n, i, j;
2466 long k, i, j;
24132467
24142468 if (x.k != y.k) LogicError("FFT rep mismatch");
24152469
24162470 k = x.k;
2417 n = 1L << k;
24182471
24192472 z.SetSize(k);
2473
2474 long len = z.len = min(x.len, y.len);
24202475
24212476 long nprimes = FFTInfo->NumPrimes;
24222477
24262481 const long *yp = &y.tbl[i][0];
24272482 long q = GetFFTPrime(i);
24282483
2429 for (j = 0; j < n; j++)
2484 for (j = 0; j < len; j++)
24302485 zp[j] = AddMod(xp[j], yp[j], q);
24312486 }
24322487
24392494 {
24402495 BasicThreadPool *pool = GetThreadPool();
24412496
2442 if (!pool || pool->active() || pool->NumThreads() == 1) {
2497 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh1(1L << x.k)) {
24432498 basic_add(z, x, y);
24442499 return;
24452500 }
24462501 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
24472502
2448 long k, n;
2503 long k;
24492504
24502505 if (x.k != y.k) LogicError("FFT rep mismatch");
24512506
24522507 k = x.k;
2453 n = 1L << k;
24542508
24552509 z.SetSize(k);
24562510
2511 long len = z.len = min(x.len, y.len);
2512
24572513 long nprimes = FFTInfo->NumPrimes;
24582514
24592515 pool->exec_range(nprimes,
2460 [&x, &y, &z, n](long first, long last) {
2516 [&x, &y, &z, len](long first, long last) {
24612517 for (long i = first; i < last; i++) {
24622518 long *zp = &z.tbl[i][0];
24632519 const long *xp = &x.tbl[i][0];
24642520 const long *yp = &y.tbl[i][0];
24652521 long q = GetFFTPrime(i);
24662522
2467 for (long j = 0; j < n; j++)
2523 for (long j = 0; j < len; j++)
24682524 zp[j] = AddMod(xp[j], yp[j], q);
24692525 }
24702526 } );
24722528 }
24732529
24742530 #endif
2475
24762531
24772532
24782533
24922547 n = 1L << k;
24932548
24942549 if (l < k) LogicError("reduce: bad operands");
2550 if (a.len < n) LogicError("reduce: bad len");
24952551
24962552 x.SetSize(k);
2497
2553 x.len = n;
2554
2555 if (&x == &a) return;
24982556
24992557 long nprimes = FFTInfo->NumPrimes;
25002558
25022560 ap = &a.tbl[i][0];
25032561 xp = &x.tbl[i][0];
25042562 for (j = 0; j < n; j++)
2505 xp[j] = ap[j << (l-k)];
2563 xp[j] = ap[j];
25062564 }
25072565 }
25082566
25152573 {
25162574 BasicThreadPool *pool = GetThreadPool();
25172575
2518 if (!pool || pool->active() || pool->NumThreads() == 1) {
2576 if (&x == &a || !pool || pool->active() || pool->NumThreads() == 1 || BelowThresh1(1L << k)) {
25192577 basic_reduce(x, a, k);
25202578 return;
25212579 }
25282586 n = 1L << k;
25292587
25302588 if (l < k) LogicError("reduce: bad operands");
2589 if (a.len < n) LogicError("reduce: bad len");
25312590
25322591 x.SetSize(k);
2592 x.len = n;
25332593
25342594
25352595 long nprimes = FFTInfo->NumPrimes;
25402600 const long *ap = &a.tbl[i][0];
25412601 long *xp = &x.tbl[i][0];
25422602 for (long j = 0; j < n; j++)
2543 xp[j] = ap[j << (l-k)];
2603 xp[j] = ap[j];
25442604 }
25452605 } );
25462606 }
25472607
2608
25482609 #endif
2610
25492611
25502612
25512613
25622624 n = 1L << k;
25632625
25642626 if (l < k) LogicError("AddExpand: bad args");
2627
2628 if (a.len != n) LogicError("AddExpand: bad len");
2629 if (x.len < n) LogicError("AddExpand: bad len");
25652630
25662631
25672632 long nprimes = FFTInfo->NumPrimes;
25712636 const long *ap = &a.tbl[i][0];
25722637 long *xp = &x.tbl[i][0];
25732638 for (j = 0; j < n; j++) {
2574 long j1 = j << (l-k);
2575 xp[j1] = AddMod(xp[j1], ap[j], q);
2639 xp[j] = AddMod(xp[j], ap[j], q);
25762640 }
25772641 }
25782642 }
25842648 {
25852649 BasicThreadPool *pool = GetThreadPool();
25862650
2587 if (!pool || pool->active() || pool->NumThreads() == 1) {
2651 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh1(1L << a.k)) {
25882652 basic_AddExpand(x, a);
25892653 return;
25902654 }
25972661 n = 1L << k;
25982662
25992663 if (l < k) LogicError("AddExpand: bad args");
2664
2665 if (a.len != n) LogicError("AddExpand: bad len");
2666 if (x.len < n) LogicError("AddExpand: bad len");
26002667
26012668
26022669 long nprimes = FFTInfo->NumPrimes;
26082675 const long *ap = &a.tbl[i][0];
26092676 long *xp = &x.tbl[i][0];
26102677 for (long j = 0; j < n; j++) {
2611 long j1 = j << (l-k);
2612 xp[j1] = AddMod(xp[j1], ap[j], q);
2678 xp[j] = AddMod(xp[j], ap[j], q);
26132679 }
26142680 }
26152681 } );
26162682 }
26172683
2618
26192684 #endif
2685
2686
2687
26202688
26212689
26222690
26562724 {
26572725 BasicThreadPool *pool = GetThreadPool();
26582726
2659 if (!pool || pool->active() || pool->NumThreads() == 1) {
2727 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(max(hi-lo+1,0))) {
26602728 basic_ToZZ_pXModRep(y, x, lo, hi);
26612729 return;
26622730 }
27272795
27282796
27292797 x.SetSize(k);
2798 x.len = n;
27302799
27312800 long nprimes = FFTInfo->NumPrimes;
27322801
27562825 {
27572826 BasicThreadPool *pool = GetThreadPool();
27582827
2759 if (!pool || pool->active() || pool->NumThreads() == 1) {
2828 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << k)) {
27602829 basic_ToFFTRep(x, a, k, lo, hi);
27612830 return;
27622831 }
27782847
27792848
27802849 x.SetSize(k);
2850 x.len = n;
27812851
27822852 long nprimes = FFTInfo->NumPrimes;
27832853
28182888 long k = a.k;
28192889 long n = 1L << k;
28202890
2891 if (a.len != n) LogicError("FromFFTRep: bad len");
2892
28212893 x.SetSize(n);
28222894 for (long i = 0; i < nprimes; i++) {
28232895 long *xp = &x.tbl[i][0];
28542926
28552927
28562928
2857
2858
2859
2860
2861
28622929 void FFTMul(ZZ_pX& x, const ZZ_pX& a, const ZZ_pX& b)
28632930 {
2864 long k, d;
2865
28662931 if (IsZero(a) || IsZero(b)) {
28672932 clear(x);
28682933 return;
28692934 }
28702935
2871 d = deg(a) + deg(b);
2872 k = NextPowerOfTwo(d+1);
2936 long da = deg(a);
2937 long db = deg(b);
2938 long d = da+db;
2939 long k = NextPowerOfTwo(d+1);
28732940
28742941 FFTRep R1(INIT_SIZE, k), R2(INIT_SIZE, k);
28752942
2876 ToFFTRep(R1, a, k);
2877 ToFFTRep(R2, b, k);
2943 ToFFTRep_trunc(R1, a, k, d+1);
2944 ToFFTRep_trunc(R2, b, k, d+1);
28782945 mul(R1, R1, R2);
28792946 FromFFTRep(x, R1, 0, d);
28802947 }
28812948
28822949 void FFTSqr(ZZ_pX& x, const ZZ_pX& a)
28832950 {
2884 long k, d;
2885
28862951 if (IsZero(a)) {
28872952 clear(x);
28882953 return;
28892954 }
28902955
2891 d = 2*deg(a);
2892 k = NextPowerOfTwo(d+1);
2956 long da = deg(a);
2957 long d = 2*da;
2958 long k = NextPowerOfTwo(d+1);
28932959
28942960 FFTRep R1(INIT_SIZE, k);
28952961
2896 ToFFTRep(R1, a, k);
2962 ToFFTRep_trunc(R1, a, k, d+1);
28972963 mul(R1, R1, R1);
28982964 FromFFTRep(x, R1, 0, d);
28992965 }
2966
29002967
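Note on the truncated-FFT change above: for FFTMul the product degree is d = deg(a)+deg(b), so only d+1 transform points carry information even though the convolution ring has 2^k >= d+1 points; this is what ToFFTRep_trunc exploits. A minimal standalone sketch of that size calculation (hypothetical helper, not NTL API):

    // Given deg(a) = da and deg(b) = db, return the log2 of the full FFT size
    // together with the truncated length d+1 that the product actually needs.
    #include <utility>

    std::pair<long, long> fft_sizes(long da, long db)
    {
        long d = da + db;
        long k = 0;
        while ((1L << k) < d + 1) k++;   // plays the role of NextPowerOfTwo(d+1)
        return { k, d + 1 };             // (k, truncated transform length)
    }
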
29012968
29022969 void CopyReverse(ZZ_pX& x, const ZZ_pX& a, long lo, long hi)
29783045 FFTRep R1(INIT_SIZE, F.l);
29793046 ZZ_pX P1(INIT_SIZE, n);
29803047
2981 ToFFTRep(R1, a, F.l, n, 2*(n-1));
3048 ToFFTRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
29823049 mul(R1, R1, F.HRep);
29833050 FromFFTRep(P1, R1, n-2, 2*n-4);
29843051
30333100 FFTRep R1(INIT_SIZE, F.l);
30343101 ZZ_pX P1(INIT_SIZE, n), qq;
30353102
3036 ToFFTRep(R1, a, F.l, n, 2*(n-1));
3103 ToFFTRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
30373104 mul(R1, R1, F.HRep);
30383105 FromFFTRep(P1, R1, n-2, 2*n-4);
30393106 qq = P1;
30893156 FFTRep R1(INIT_SIZE, F.l);
30903157 ZZ_pX P1(INIT_SIZE, n);
30913158
3092 ToFFTRep(R1, a, F.l, n, 2*(n-1));
3159 ToFFTRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
30933160 mul(R1, R1, F.HRep);
30943161 FromFFTRep(x, R1, n-2, 2*n-4);
30953162 }
32903357 FFTRep R1(INIT_SIZE, k), R2(INIT_SIZE, F.l);
32913358 ZZ_pX P1(INIT_SIZE, n);
32923359
3293 ToFFTRep(R1, a, k);
3294 ToFFTRep(R2, b, k);
3360 ToFFTRep_trunc(R1, a, k, max(1L << F.k, d));
3361 ToFFTRep_trunc(R2, b, k, max(1L << F.k, d));
32953362 mul(R1, R1, R2);
32963363 NDFromFFTRep(P1, R1, n, d-1, R2); // save R1 for future use
32973364
3298 ToFFTRep(R2, P1, F.l);
3365 ToFFTRep_trunc(R2, P1, F.l, 2*n-3);
32993366 mul(R2, R2, F.HRep);
33003367 FromFFTRep(P1, R2, n-2, 2*n-4);
33013368
33343401 FFTRep R1(INIT_SIZE, k), R2(INIT_SIZE, F.l);
33353402 ZZ_pX P1(INIT_SIZE, n);
33363403
3337 ToFFTRep(R1, a, k);
3404 ToFFTRep_trunc(R1, a, k, max(1L << F.k, d));
33383405 mul(R1, R1, R1);
33393406 NDFromFFTRep(P1, R1, n, d-1, R2); // save R1 for future use
33403407
3341 ToFFTRep(R2, P1, F.l);
3408 ToFFTRep_trunc(R2, P1, F.l, 2*n-3);
33423409 mul(R2, R2, F.HRep);
33433410 FromFFTRep(P1, R2, n-2, 2*n-4);
33443411
35543621 ZZ_pX P1(INIT_SIZE, n);
35553622
35563623
3557 ToFFTRep(R1, b, F.l);
3624 ToFFTRep_trunc(R1, b, F.l, 2*n-2);
35583625 reduce(x.B2, R1, F.k);
35593626 mul(R1, R1, F.HRep);
35603627 FromFFTRep(P1, R1, n-1, 2*n-3);
3628
35613629 ToFFTRep(x.B1, P1, F.l);
3630 // could be truncated to length max(1L << F.k, 2*n-2), except
3631 // for the usage in UpdateMap, where we would have to investigate
3632 // further
35623633 }
35633634
35643635
35893660 ZZ_pX P1(INIT_SIZE, n), P2(INIT_SIZE, n);
35903661 FFTRep R1(INIT_SIZE, F.l), R2(INIT_SIZE, F.l);
35913662
3592 ToFFTRep(R1, a, F.l);
3663 ToFFTRep_trunc(R1, a, F.l, max(1L << F.k, 2*n-2));
35933664 mul(R2, R1, B.B1);
35943665 FromFFTRep(P1, R2, n-1, 2*n-3);
35953666
296296 }
297297
298298
299 void mul(ZZ_pXMatrix& A, ZZ_pXMatrix& B, ZZ_pXMatrix& C)
299 void mul(ZZ_pXMatrix& A, ZZ_pXMatrix& B, ZZ_pXMatrix& C)
300300 // A = B*C, B and C are destroyed
301301 {
302302 long db = deg(B(1,1));
0 #include <NTL/ZZ_pX.h>
1 #include <NTL/ZZX.h>
2 #include <NTL/BasicThreadPool.h>
3
4 NTL_CLIENT
5
6
7 #define ITER (500)
8
9
10 void multest()
11 {
12 cerr << "mul";
13 for (long iter = 0; iter < ITER; iter++) {
14 if (iter % 100 == 0) cerr << ".";
15
16 long da, db;
17
18 if (RandomBnd(2)) {
19 da = RandomBnd(5000) + 100;
20 db = RandomBnd(5000) + 100;
21 }
22 else {
23 da = RandomBnd(200) + 1;
24 db = RandomBnd(200) + 1;
25 }
26
27 ZZ_pX a, b, c1, c2;
28
29 random(a, da);
30 random(b, db);
31
32 FFTMul(c1, a, b);
33
34 ZZX A, B, C;
35 conv(A, a);
36 conv(B, b);
37 mul(C, A, B);
38 conv(c2, C);
39
40 if (c1 != c2) {
41 cerr << "******* oops\n";
42 break;
43 }
44 }
45
46 cerr << "\n";
47 }
48
49
50 void sqrtest()
51 {
52 cerr << "sqr";
53 for (long iter = 0; iter < ITER; iter++) {
54 if (iter % 100 == 0) cerr << ".";
55
56 long da = RandomBnd(5000) + 100;
57 long db = RandomBnd(5000) + 100;
58
59 ZZ_pX a, b, c1, c2;
60
61 random(a, da);
62
63 if (deg(a) < 80) {
64 cerr << "*";
65 continue;
66 }
67
68 FFTSqr(c1, a);
69
70 ZZX A, B, C;
71 conv(A, a);
72 sqr(C, A);
73 conv(c2, C);
74
75 if (c1 != c2) {
76 cerr << "******* oops\n";
77 break;
78 }
79 }
80
81 cerr << "\n";
82 }
83
84
85
86
87 void mulmodtest()
88 {
89 cerr << "mulmod";
90 for (long iter = 0; iter < ITER; iter++) {
91 if (iter % 100 == 0) cerr << ".";
92
93 long n = RandomBnd(5000) + 300;
94 long da = RandomBnd(n)+1;
95 long db = RandomBnd(n)+1;
96
97 if (RandomBnd(2)) { da = n; db = n; }
98
99 ZZ_pX f;
100 random(f, n);
101 SetCoeff(f, n);
102 ZZ_pXModulus F(f);
103
104 ZZ_pX a, b, c1, c2;
105 random(a, da);
106 random(b, db);
107
108 MulMod(c1, a, b, F);
109
110 ZZX A, B, C;
111 conv(A, a);
112 conv(B, b);
113 mul(C, A, B);
114 conv(c2, C);
115 rem(c2, c2, F);
116
117 if (c1 != c2) {
118 cerr << "******** oops\n";
119 break;
120 }
121 }
122
123 cerr << "\n";
124 }
125
126
127 void sqrmodtest()
128 {
129 cerr << "sqrmod";
130 for (long iter = 0; iter < ITER; iter++) {
131 if (iter % 100 == 0) cerr << ".";
132
133 long n = RandomBnd(5000) + 300;
134 long da = RandomBnd(n)+1;
135 long db = RandomBnd(n)+1;
136
137 if (RandomBnd(2)) { da = n; db = n; }
138
139 ZZ_pX f;
140 random(f, n);
141 SetCoeff(f, n);
142 ZZ_pXModulus F(f);
143
144 ZZ_pX a, b, c1, c2;
145 random(a, da);
146 random(b, db);
147
148 SqrMod(c1, a, F);
149
150 ZZX A, B, C;
151 conv(A, a);
152 conv(B, b);
153 sqr(C, A);
154 conv(c2, C);
155 rem(c2, c2, F);
156
157 if (c1 != c2) {
158 cerr << "******** oops\n";
159 break;
160 }
161 }
162
163 cerr << "\n";
164 }
165
166
167
168 void mulmod1test()
169 {
170 cerr << "mulmod1";
171 for (long iter = 0; iter < ITER; iter++) {
172 if (iter % 100 == 0) cerr << ".";
173
174 long n = RandomBnd(5000) + 300;
175 long da = RandomBnd(n)+1;
176 long db = RandomBnd(n)+1;
177
178 if (RandomBnd(2)) { da = n; db = n; }
179
180 ZZ_pX f;
181 random(f, n);
182 SetCoeff(f, n);
183 ZZ_pXModulus F(f);
184
185 ZZ_pX a, b, c1, c2;
186 random(a, da);
187 random(b, db);
188
189 ZZ_pXMultiplier bb;
190 build(bb, b, F);
191
192 MulMod(c1, a, bb, F);
193
194 ZZX A, B, C;
195 conv(A, a);
196 conv(B, b);
197 mul(C, A, B);
198 conv(c2, C);
199 rem(c2, c2, F);
200
201 if (c1 != c2) {
202 cerr << "******** oops\n";
203 break;
204 }
205 }
206
207 cerr << "\n";
208 }
209
210
211 namespace NTL {
212
213 void CopyReverse(ZZ_pX& x, const ZZ_pX& a, long lo, long hi);
214
215 }
216
217
218
219 struct ZZ_pXTransMultiplier {
220 ZZ_pX f0, fbi, b;
221 long shamt, shamt_fbi, shamt_b;
222 };
223
224
225
226
227 void build(ZZ_pXTransMultiplier& B, const ZZ_pX& b, const ZZ_pXModulus& F)
228 {
229 long db = deg(b);
230
231 if (db >= F.n) LogicError("build TransMultiplier: bad args");
232
233 ZZ_pX t;
234
235 LeftShift(t, b, F.n-1);
236 div(t, t, F);
237
238 // we optimize for low degree b
239
240 long d;
241
242 d = deg(t);
243 if (d < 0)
244 B.shamt_fbi = 0;
245 else
246 B.shamt_fbi = F.n-2 - d;
247
248 CopyReverse(B.fbi, t, 0, d);
249
250 // The following code optimizes the case when
251 // f = X^n + low degree poly
252
253 trunc(t, F.f, F.n);
254 d = deg(t);
255 if (d < 0)
256 B.shamt = 0;
257 else
258 B.shamt = d;
259
260 CopyReverse(B.f0, t, 0, d);
261
262 if (db < 0)
263 B.shamt_b = 0;
264 else
265 B.shamt_b = db;
266
267 CopyReverse(B.b, b, 0, db);
268 }
269
270
271
272 void TransMulMod(ZZ_pX& x, const ZZ_pX& a, const ZZ_pXTransMultiplier& B,
273 const ZZ_pXModulus& F)
274 {
275 if (deg(a) >= F.n) LogicError("TransMulMod: bad args");
276
277 ZZ_pX t1, t2;
278
279 mul(t1, a, B.b);
280 RightShift(t1, t1, B.shamt_b);
281
282 mul(t2, a, B.f0);
283 RightShift(t2, t2, B.shamt);
284 trunc(t2, t2, F.n-1);
285
286 mul(t2, t2, B.fbi);
287 if (B.shamt_fbi > 0) LeftShift(t2, t2, B.shamt_fbi);
288 trunc(t2, t2, F.n-1);
289 LeftShift(t2, t2, 1);
290
291 sub(x, t1, t2);
292 }
293
294
295
296 void UpdateMap(vec_ZZ_p& x, const vec_ZZ_p& a,
297 const ZZ_pXTransMultiplier& B, const ZZ_pXModulus& F)
298 {
299 ZZ_pX xx;
300 TransMulMod(xx, to_ZZ_pX(a), B, F);
301 x = xx.rep;
302 }
303
304
305
306 void updatetest()
307 {
308 cerr << "update";
309 for (long iter = 0; iter < ITER; iter++) {
310 if (iter % 100 == 0) cerr << ".";
311
312 long n = RandomBnd(5000) + 300;
313 long da = RandomBnd(n)+1;
314 long db = RandomBnd(n)+1;
315
316 if (RandomBnd(2)) { da = n; db = n; }
317
318 ZZ_pX f;
319 random(f, n);
320 SetCoeff(f, n);
321 ZZ_pXModulus F(f);
322
323 ZZ_pX a, b;
324 random(a, da);
325 random(b, db);
326
327 ZZ_pXMultiplier bb1;
328 build(bb1, b, F);
329
330 ZZ_pXTransMultiplier bb2;
331 build(bb2, b, F);
332
333 Vec<ZZ_p> x1, x2;
334
335 UpdateMap(x1, a.rep, bb1, F);
336 UpdateMap(x2, a.rep, bb2, F);
337
338
339 if (x1 != x2) {
340 cerr << "******** oops\n";
341 break;
342 }
343 }
344
345 cerr << "\n";
346 }
347
348 void divremtest()
349 {
350 cerr << "divrem";
351 for (long iter = 0; iter < ITER; iter++) {
352 if (iter % 100 == 0) cerr << ".";
353
354 long n = RandomBnd(5000) + 300;
355 long dq = RandomBnd(n);
356
357
358 ZZ_pX f;
359 random(f, n);
360 SetCoeff(f, n);
361 ZZ_pXModulus F(f);
362
363 ZZ_pX a, q, r, q1, r1;
364
365 random(a, 2*n-1);
366
367 DivRem(q, r, a, F);
368 rem(r1, a, F);
369 div(q1, a, F);
370
371 if (deg(r) >= n || a != q*f + r || q != q1 || r != r1) {
372 cerr << "******** oops\n";
373 break;
374 }
375 }
376
377 cerr << "\n";
378 }
379
380
381 int main()
382 {
383 ZZ p;
384 GenPrime(p, 100);
385
386 ZZ_p::init(p);
387
388 multest();
389 sqrtest();
390 mulmodtest();
391 sqrmodtest();
392 mulmod1test();
393 divremtest();
394 updatetest();
395
396 #ifdef NTL_THREAD_BOOST
397
398 GenPrime(p, 500);
399 ZZ_p::init(p);
400
401 SetNumThreads(4);
402 cerr << "numthreads=4\n";
403
404 multest();
405 sqrtest();
406 mulmodtest();
407 sqrmodtest();
408 mulmod1test();
409 divremtest();
410 updatetest();
411
412 #endif
413
414 }
415
224224
225225 #endif
226226
227 #if @{NTL_ENABLE_AVX_FFT}
228 #define NTL_ENABLE_AVX_FFT
229
230 /*
231 * This will compile NTL in a way that enables an AVX implementation
232 * of the small-prime FFT.
233 */
234
235 #endif
236
237
238 #if @{NTL_AVOID_AVX512}
239 #define NTL_AVOID_AVX512
240
241 /*
242 * This will compile NTL in a way that avoids 512-bit operations,
243 * even if AVX512 is available.
244 */
245
246 #endif
227247
228248 #if @{NTL_RANGE_CHECK}
229249 #define NTL_RANGE_CHECK
133133 /*
134134 * On machines with wide floating point registers, the routine _ntl_ForceToMem
135135 * is used to force a floating point double to a memory location.
136 *
136 * I've checked with GCC, and even with LTO, this will work.
137 * That said, I wouldn't really recommend applying LTO to NTL...
137138 */
138139
139140 void _ntl_ForceToMem(double *p)
163164
164165 double _ntl_ldexp(double x, long e)
165166 {
167 if (x == 0.0) return x;
168
166169 if (e > NTL_MAX_INT)
167170 return x/_ntl_ldexp_zero;
168171 else if (e < NTL_MIN_INT)
102102 if (nail_bits > 0)
103103 fprintf(stderr, "WARNING: GMP_NAIL_BITS > 0: this has not been well tested\n");
104104
105 if (__GNU_MP_VERSION < 5)
106 Error("GMP version 5.0.0 or later required");
105107
106108 // check that GMP_LIMB_BITS == mp_bits_per_limb as a consistency check
107109 if (GMP_LIMB_BITS != mp_bits_per_limb)
3535
3636 #ifdef NTL_GMP_LIP
3737 #include <gmp.h>
38
39 #if (__GNU_MP_VERSION < 5)
40 #error "GMP version 5.0.0 or later required"
41 #endif
42
3843 #endif
3944
4045 NTL_IMPORT_FROM_STD
4146 NTL_USE_NNS
4247
4348
49 #if (defined(NTL_HAVE_LL_TYPE) && NTL_BITS_PER_LIMB_T == NTL_BITS_PER_LONG)
50 #define NTL_VIABLE_LL
51 #endif
4452
4553
4654 #ifdef NTL_GMP_LIP
225233 {
226234 long i;
227235
228 i = 0;
229 do
230 {
231 _ntl_limb_t r = ap[i] + b;
232 rp[i] = CLIP(r);
233 b = r >> NTL_ZZ_NBITS;
234 }
235 while (++i < n);
236
237 return b;
236 if (rp != ap) {
237 i = 0;
238 do
239 {
240 _ntl_limb_t r = ap[i] + b;
241 rp[i] = CLIP(r);
242 b = r >> NTL_ZZ_NBITS;
243 }
244 while (++i < n);
245
246 return b;
247 }
248 else {
249 i = 0;
250 do
251 {
252 if (!b) return 0;
253 _ntl_limb_t r = ap[i] + b;
254 rp[i] = CLIP(r);
255 b = r >> NTL_ZZ_NBITS;
256 }
257 while (++i < n);
258
259 return b;
260 }
238261 }
239262
240263
272295 {
273296 long i;
274297
275 i = 0;
276 do
277 {
278 _ntl_limb_t r = ap[i] - b;
279 rp[i] = CLIP(r);
280 b = (r >> NTL_ZZ_NBITS) & 1;
281 }
282 while (++i < n);
283
284 return b;
298 if (rp != ap) {
299 i = 0;
300 do
301 {
302 _ntl_limb_t r = ap[i] - b;
303 rp[i] = CLIP(r);
304 b = (r >> NTL_ZZ_NBITS) & 1;
305 }
306 while (++i < n);
307
308 return b;
309 }
310 else {
311 i = 0;
312 do
313 {
314 if (!b) return 0;
315 _ntl_limb_t r = ap[i] - b;
316 rp[i] = CLIP(r);
317 b = (r >> NTL_ZZ_NBITS) & 1;
318 }
319 while (++i < n);
320
321 return b;
322 }
323
285324 }
286325
287326
824863 _ntl_mpn_base_sqr(c, a, sa);
825864 }
826865
866
867 // Like the corresponding GMP routine, this assumes un >= vn >= 1
827868 _ntl_limb_t
828869 _ntl_mpn_mul (_ntl_limb_t* rp, const _ntl_limb_t* up, long un, const _ntl_limb_t* vp, long vn)
829870 {
9861027 }
9871028 }
9881029
989
990 #else
991
992
993
994 #if (__GNU_MP_VERSION < 3)
995
996 #error "You have to use GMP version >= 3.1"
997
998 #endif
999
1000 #if ((__GNU_MP_VERSION == 3) && (__GNU_MP_VERSION_MINOR < 1))
1001
1002 #error "You have to use GMP version >= 3.1"
1003
1004 #endif
1005
1006
1007
1008 /* v 3.1 is supposed mpn_tdiv_qr defined, but it doesn't.
1009 Here's a workaround */
1010
1011 #if ((__GNU_MP_VERSION == 3) && (__GNU_MP_VERSION_MINOR == 1) && (__GNU_MP_VERSION_PATCHLEVEL == 0))
1012
1013 #define mpn_tdiv_qr __MPN(tdiv_qr)
1014
1015
1016 extern "C"
1017 void mpn_tdiv_qr(mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t,
1018 mp_srcptr, mp_size_t);
1019
1020 #endif
10211030
10221031 #endif
10231032
10781087
10791088 static
10801089 inline long& ALLOC(_ntl_gbigint p)
1081 { return (((long *) p)[0]); }
1090 { return p->alloc_; }
10821091
10831092 static
10841093 inline long& SIZE(_ntl_gbigint p)
1085 { return (((long *) p)[1]); }
1094 { return p->size_; }
10861095
10871096 static
10881097 inline _ntl_limb_t * DATA(_ntl_gbigint p)
1089 { return ((_ntl_limb_t *) (((long *) (p)) + 2)); }
1098 { return (_ntl_limb_t *) (p+1); }
10901099
10911100 static
10921101 inline long STORAGE(long len)
1093 { return ((long)(2*sizeof(long) + (len)*sizeof(_ntl_limb_t))); }
1102 { return ((long)(sizeof(_ntl_gbigint_body) + (len)*sizeof(_ntl_limb_t))); }
10941103
10951104 static
10961105 inline long MustAlloc(_ntl_gbigint c, long len)
11771186
11781187
11791188
1180 #if (defined(NTL_HAVE_LL_TYPE) && NTL_BITS_PER_LIMB_T == NTL_BITS_PER_LONG)
1181 #define NTL_VIABLE_LL
1182 #endif
11831189
11841190 #if (defined(NTL_CRT_ALTCODE) || defined(NTL_CRT_ALTCODE_SMALL))
11851191
13761382
13771383 len++; /* always allocate at least one more than requested */
13781384
1379 oldlen = (long) (oldlen * 1.4); /* always increase by at least 40% */
1385 oldlen = _ntl_vec_grow(oldlen);
13801386 if (len < oldlen)
13811387 len = oldlen;
13821388
16341640
16351641 if (b<0) LogicError("_ntl_gsetbit: negative index");
16361642
1637 if (ZEROP(*a)) {
1638 _ntl_gintoz(1, a);
1639 _ntl_glshift(*a, b, a);
1640 return 0;
1641 }
1642
16431643 bl = (b/NTL_ZZ_NBITS);
16441644 wh = ((_ntl_limb_t) 1) << (b - NTL_ZZ_NBITS*bl);
16451645
1646 GET_SIZE_NEG(sa, aneg, *a);
1646 if (!*a)
1647 sa = aneg = 0;
1648 else
1649 GET_SIZE_NEG(sa, aneg, *a);
16471650
16481651 if (sa > bl) {
16491652 adata = DATA(*a);
16751678
16761679 if (b<0) LogicError("_ntl_gswitchbit: negative index");
16771680
1678
1679 if (ZEROP(*a)) {
1680 _ntl_gintoz(1, a);
1681 _ntl_glshift(*a, b, a);
1682 return 0;
1683 }
1684
16851681 bl = (b/NTL_ZZ_NBITS);
16861682 wh = ((_ntl_limb_t) 1) << (b - NTL_ZZ_NBITS*bl);
16871683
1688 GET_SIZE_NEG(sa, aneg, *a);
1684 if (!*a)
1685 sa = aneg = 0;
1686 else
1687 GET_SIZE_NEG(sa, aneg, *a);
16891688
16901689 if (sa > bl) {
16911690 adata = DATA(*a);
25142513 }
25152514 }
25162515
2516
25172517 void
25182518 _ntl_gsadd(_ntl_gbigint a, long b, _ntl_gbigint *cc)
25192519 {
2520 // FIXME: this is really inefficient...too much overhead
2521 GRegister(B);
2522 _ntl_gintoz(b, &B);
2523 _ntl_gadd(a, B, cc);
2524 }
2520 if (b == 0) {
2521 _ntl_gcopy(a, cc);
2522 return;
2523 }
2524
2525 _ntl_limb_t abs_b = ABS(b);
2526
2527 if (XCLIP(abs_b)) {
2528 GRegister(xb);
2529 _ntl_gintoz(b,&xb);
2530 _ntl_gadd(a, xb, cc);
2531 return;
2532 }
2533
2534 long bneg = b < 0;
2535
2536
2537 if (ZEROP(a)) {
2538 if (!*cc) _ntl_gsetlength(cc, 1);
2539 SIZE(*cc) = 1 - 2*bneg;
2540 DATA(*cc)[0] = abs_b;
2541 return;
2542 }
2543
2544 long sa, aneg;
2545
2546 GET_SIZE_NEG(sa, aneg, a);
2547
2548 if (aneg == bneg) {
2549 // signs equal: addition
2550
2551 if (a == *cc) {
2552 // a aliases c
2553
2554 _ntl_limb_t *adata = DATA(a);
2555 _ntl_limb_t carry = NTL_MPN(add_1)(adata, adata, sa, abs_b);
2556
2557 if (carry) {
2558 if (MustAlloc(a, sa+1)) {
2559 _ntl_gsetlength(cc, sa+1);
2560 a = *cc;
2561 adata = DATA(a);
2562 }
2563 adata[sa] = 1;
2564 sa++;
2565 if (aneg) sa = -sa;
2566 SIZE(a) = sa;
2567 }
2568 }
2569 else {
2570 // a and c do not alias
2571 if (MustAlloc(*cc, sa+1)) _ntl_gsetlength(cc, sa+1);
2572 _ntl_limb_t *adata = DATA(a);
2573 _ntl_limb_t *cdata = DATA(*cc);
2574 _ntl_limb_t carry = NTL_MPN(add_1)(cdata, adata, sa, abs_b);
2575 if (carry) {
2576 cdata[sa] = 1;
2577 sa++;
2578 }
2579 if (aneg) sa = -sa;
2580 SIZE(*cc) = sa;
2581 }
2582 }
2583 else {
2584 // opposite sign: subtraction
2585
2586 if (sa == 1) {
2587 _ntl_limb_t abs_a = DATA(a)[0];
2588 if (abs_a == abs_b)
2589 _ntl_gzero(cc);
2590 else if (abs_a > abs_b) {
2591 if (MustAlloc(*cc, 1)) _ntl_gsetlength(cc, 1);
2592 DATA(*cc)[0] = abs_a - abs_b;
2593 SIZE(*cc) = 1-2*aneg;
2594 }
2595 else {
2596 if (MustAlloc(*cc, 1)) _ntl_gsetlength(cc, 1);
2597 DATA(*cc)[0] = abs_b - abs_a;
2598 SIZE(*cc) = -1+2*aneg;
2599 }
2600 }
2601 else {
2602 if (MustAlloc(*cc, sa)) _ntl_gsetlength(cc, sa);
2603 _ntl_limb_t *adata = DATA(a);
2604 _ntl_limb_t *cdata = DATA(*cc);
2605 NTL_MPN(sub_1)(cdata, adata, sa, abs_b);
2606 if (cdata[sa-1] == 0) sa--;
2607 if (aneg) sa = -sa;
2608 SIZE(*cc) = sa;
2609 }
2610 }
2611
2612 }
2613
2614 void
2615 _ntl_gssub(_ntl_gbigint a, long b, _ntl_gbigint *cc)
2616 {
2617 if (b == 0) {
2618 _ntl_gcopy(a, cc);
2619 return;
2620 }
2621
2622 _ntl_limb_t abs_b = ABS(b);
2623
2624 if (XCLIP(abs_b)) {
2625 GRegister(xb);
2626 _ntl_gintoz(b,&xb);
2627 _ntl_gsub(a, xb, cc);
2628 return;
2629 }
2630
2631 // the rest of this routine is precisely the same
2632 // as gsadd, except for the following line,
2633 // which has the sense of the test reversed
2634 long bneg = b >= 0;
2635
2636
2637 if (ZEROP(a)) {
2638 if (!*cc) _ntl_gsetlength(cc, 1);
2639 SIZE(*cc) = 1 - 2*bneg;
2640 DATA(*cc)[0] = abs_b;
2641 return;
2642 }
2643
2644 long sa, aneg;
2645
2646 GET_SIZE_NEG(sa, aneg, a);
2647
2648 if (aneg == bneg) {
2649 // signs equal: addition
2650
2651 if (a == *cc) {
2652 // a aliases c
2653
2654 _ntl_limb_t *adata = DATA(a);
2655 _ntl_limb_t carry = NTL_MPN(add_1)(adata, adata, sa, abs_b);
2656
2657 if (carry) {
2658 if (MustAlloc(a, sa+1)) {
2659 _ntl_gsetlength(cc, sa+1);
2660 a = *cc;
2661 adata = DATA(a);
2662 }
2663 adata[sa] = 1;
2664 sa++;
2665 if (aneg) sa = -sa;
2666 SIZE(a) = sa;
2667 }
2668 }
2669 else {
2670 // a and c do not alias
2671 if (MustAlloc(*cc, sa+1)) _ntl_gsetlength(cc, sa+1);
2672 _ntl_limb_t *adata = DATA(a);
2673 _ntl_limb_t *cdata = DATA(*cc);
2674 _ntl_limb_t carry = NTL_MPN(add_1)(cdata, adata, sa, abs_b);
2675 if (carry) {
2676 cdata[sa] = 1;
2677 sa++;
2678 }
2679 if (aneg) sa = -sa;
2680 SIZE(*cc) = sa;
2681 }
2682 }
2683 else {
2684 // opposite sign: subtraction
2685
2686 if (sa == 1) {
2687 _ntl_limb_t abs_a = DATA(a)[0];
2688 if (abs_a == abs_b)
2689 _ntl_gzero(cc);
2690 else if (abs_a > abs_b) {
2691 if (MustAlloc(*cc, 1)) _ntl_gsetlength(cc, 1);
2692 DATA(*cc)[0] = abs_a - abs_b;
2693 SIZE(*cc) = 1-2*aneg;
2694 }
2695 else {
2696 if (MustAlloc(*cc, 1)) _ntl_gsetlength(cc, 1);
2697 DATA(*cc)[0] = abs_b - abs_a;
2698 SIZE(*cc) = -1+2*aneg;
2699 }
2700 }
2701 else {
2702 if (MustAlloc(*cc, sa)) _ntl_gsetlength(cc, sa);
2703 _ntl_limb_t *adata = DATA(a);
2704 _ntl_limb_t *cdata = DATA(*cc);
2705 NTL_MPN(sub_1)(cdata, adata, sa, abs_b);
2706 if (cdata[sa-1] == 0) sa--;
2707 if (aneg) sa = -sa;
2708 SIZE(*cc) = sa;
2709 }
2710 }
2711
2712 }
2713
2714
25252715
25262716 void
25272717 _ntl_gsub(_ntl_gbigint a, _ntl_gbigint b, _ntl_gbigint *cc)
26682858 SIZE(c) = sc;
26692859 }
26702860
2861 #if 1
2862
2863 // This version is faster for small inputs.
2864 // It avoids some overheads incurred only when dealing with
2865 // aliased outputs.
2866 // It also makes direct calls to lower-level mpn functions
2867 // for smaller inputs (and for one limb inputs, it avoids
2868 // function calls altogether (usually)).
2869
2870 // Speedup: 2.5x 1 limb
2871 // 1.4x 2 limb
2872 // 1.3x 3 limb
2873
2874 static inline _ntl_limb_t
2875 base_mul (_ntl_limb_t* rp, const _ntl_limb_t* up, long un, const _ntl_limb_t* vp, long vn)
2876 {
2877 rp[un] = NTL_MPN(mul_1) (rp, up, un, vp[0]);
2878
2879 while (--vn >= 1)
2880 {
2881 rp += 1, vp += 1;
2882 rp[un] = NTL_MPN(addmul_1) (rp, up, un, vp[0]);
2883 }
2884 return rp[un];
2885 }
2886
2887 void _ntl_gmul(_ntl_gbigint a, _ntl_gbigint b, _ntl_gbigint *cc)
2888 {
2889 long sa, aneg, sb, bneg, alias, sc;
2890 _ntl_limb_t *adata, *bdata, *cdata, msl;
2891 _ntl_gbigint c;
2892
2893 if (ZEROP(a) || ZEROP(b)) {
2894 _ntl_gzero(cc);
2895 return;
2896 }
2897
2898 GET_SIZE_NEG(sa, aneg, a);
2899 GET_SIZE_NEG(sb, bneg, b);
2900
2901 if (a != *cc && b != *cc) {
2902 // no aliasing
2903
2904 c = *cc;
2905
2906 sc = sa + sb;
2907 if (MustAlloc(c, sc)) {
2908 _ntl_gsetlength(&c, sc);
2909 *cc = c;
2910 }
2911
2912 adata = DATA(a);
2913 bdata = DATA(b);
2914 cdata = DATA(c);
2915
2916 if (adata == bdata) {
2917 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
2918 if (sa == 1) {
2919 ll_type prod;
2920 ll_mul(prod, adata[0], adata[0]);
2921 cdata[0] = ll_get_lo(prod);
2922 msl = cdata[1] = ll_get_hi(prod);
2923 } else
2924 #endif
2925 {
2926 NTL_MPN(sqr)(cdata, adata, sa);
2927 msl = cdata[2*sa-1];
2928 }
2929 }
2930 else {
2931 #if 1
2932 if (sa >= sb) {
2933 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
2934 if (sa == 1) {
2935 ll_type prod;
2936 ll_mul(prod, adata[0], bdata[0]);
2937 cdata[0] = ll_get_lo(prod);
2938 msl = cdata[1] = ll_get_hi(prod);
2939 } else
2940 #endif
2941 if (sa <= 4)
2942 msl = base_mul(cdata, adata, sa, bdata, sb);
2943 else
2944 msl = NTL_MPN(mul)(cdata, adata, sa, bdata, sb);
2945 }
2946 else {
2947 if (sb <= 4)
2948 msl = base_mul(cdata, bdata, sb, adata, sa);
2949 else
2950 msl = NTL_MPN(mul)(cdata, bdata, sb, adata, sa);
2951 }
2952 #else
2953 if (sa >= sb) {
2954 msl = NTL_MPN(mul)(cdata, adata, sa, bdata, sb);
2955 }
2956 else {
2957 msl = NTL_MPN(mul)(cdata, bdata, sb, adata, sa);
2958 }
2959 #endif
2960 }
2961
2962 if (!msl) sc--;
2963 if (aneg != bneg) sc = -sc;
2964 SIZE(c) = sc;
2965 }
2966 else {
2967 // aliasing
2968 GRegister(mem);
2969
2970 c = mem;
2971
2972 sc = sa + sb;
2973 if (MustAlloc(c, sc)) {
2974 _ntl_gsetlength(&c, sc);
2975 mem = c;
2976 }
2977
2978 adata = DATA(a);
2979 bdata = DATA(b);
2980 cdata = DATA(c);
2981
2982 if (adata == bdata) {
2983 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
2984 if (sa == 1) {
2985 ll_type prod;
2986 ll_mul(prod, adata[0], adata[0]);
2987 cdata[0] = ll_get_lo(prod);
2988 msl = cdata[1] = ll_get_hi(prod);
2989 } else
2990 #endif
2991 {
2992 NTL_MPN(sqr)(cdata, adata, sa);
2993 msl = cdata[2*sa-1];
2994 }
2995 }
2996 else {
2997 #if 1
2998 if (sa >= sb) {
2999 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
3000 if (sa == 1) {
3001 ll_type prod;
3002 ll_mul(prod, adata[0], bdata[0]);
3003 cdata[0] = ll_get_lo(prod);
3004 msl = cdata[1] = ll_get_hi(prod);
3005 } else
3006 #endif
3007 if (sa <= 4)
3008 msl = base_mul(cdata, adata, sa, bdata, sb);
3009 else
3010 msl = NTL_MPN(mul)(cdata, adata, sa, bdata, sb);
3011 }
3012 else {
3013 if (sb <= 4)
3014 msl = base_mul(cdata, bdata, sb, adata, sa);
3015 else
3016 msl = NTL_MPN(mul)(cdata, bdata, sb, adata, sa);
3017 }
3018 #else
3019 if (sa >= sb) {
3020 msl = NTL_MPN(mul)(cdata, adata, sa, bdata, sb);
3021 }
3022 else {
3023 msl = NTL_MPN(mul)(cdata, bdata, sb, adata, sa);
3024 }
3025 #endif
3026 }
3027
3028 if (!msl) sc--;
3029 if (aneg != bneg) sc = -sc;
3030 SIZE(c) = sc;
3031
3032 _ntl_gcopy(mem, cc);
3033 }
3034
3035 }
3036
3037 #else
26713038 void _ntl_gmul(_ntl_gbigint a, _ntl_gbigint b, _ntl_gbigint *cc)
26723039 {
26733040 GRegister(mem);
27173084
27183085 if (alias) _ntl_gcopy(mem, cc);
27193086 }
3087 #endif
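The one-limb fast path in the new _ntl_gmul above computes the full two-limb product directly via ll_mul instead of calling into mpn. A minimal standalone illustration of the same idea (hypothetical, not NTL code; assumes a 64-bit limb and the GCC/Clang unsigned __int128 extension):

    #include <cstdint>

    // Multiply two single limbs and return both halves of the double-width
    // product, the operation the sa == 1 branch performs via ll_mul and
    // ll_get_lo/ll_get_hi.
    void mul1(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi)
    {
        unsigned __int128 prod = (unsigned __int128) a * b;
        lo = (uint64_t) prod;
        hi = (uint64_t) (prod >> 64);
    }
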
27203088
27213089 void _ntl_gsq(_ntl_gbigint a, _ntl_gbigint *cc)
27223090 {
2723 _ntl_gmul(a, a, cc);
2724 /* this is good enough...eventually, mpn_sqr_n will be called */
3091 long sa, aneg, alias, sc;
3092 _ntl_limb_t *adata, *cdata, msl;
3093 _ntl_gbigint c;
3094
3095 if (ZEROP(a)) {
3096 _ntl_gzero(cc);
3097 return;
3098 }
3099
3100 GET_SIZE_NEG(sa, aneg, a);
3101
3102 if (a != *cc) {
3103 // no aliasing
3104
3105 c = *cc;
3106
3107 sc = sa + sa;
3108 if (MustAlloc(c, sc)) {
3109 _ntl_gsetlength(&c, sc);
3110 *cc = c;
3111 }
3112
3113 adata = DATA(a);
3114 cdata = DATA(c);
3115
3116 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
3117 if (sa == 1) {
3118 ll_type prod;
3119 ll_mul(prod, adata[0], adata[0]);
3120 cdata[0] = ll_get_lo(prod);
3121 msl = cdata[1] = ll_get_hi(prod);
3122 } else
3123 #endif
3124 {
3125 NTL_MPN(sqr)(cdata, adata, sa);
3126 msl = cdata[2*sa-1];
3127 }
3128
3129 if (!msl) sc--;
3130 SIZE(c) = sc;
3131 }
3132 else {
3133 // aliasing
3134 GRegister(mem);
3135
3136 c = mem;
3137
3138 sc = sa + sa;
3139 if (MustAlloc(c, sc)) {
3140 _ntl_gsetlength(&c, sc);
3141 mem = c;
3142 }
3143
3144 adata = DATA(a);
3145 cdata = DATA(c);
3146
3147 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
3148 if (sa == 1) {
3149 ll_type prod;
3150 ll_mul(prod, adata[0], adata[0]);
3151 cdata[0] = ll_get_lo(prod);
3152 msl = cdata[1] = ll_get_hi(prod);
3153 } else
3154 #endif
3155 {
3156 NTL_MPN(sqr)(cdata, adata, sa);
3157 msl = cdata[2*sa-1];
3158 }
3159
3160 if (!msl) sc--;
3161 SIZE(c) = sc;
3162
3163 _ntl_gcopy(mem, cc);
3164 }
3165
27253166 }
27263167
27273168
37424183 SIZE(d) = sd;
37434184 SIZE(xa) = sxa;
37444185
3745 /* Thes two ForceNormal's are work-arounds for GMP bugs
4186 #if 0
4187 // since we're now requiring GMP version 5.0.0 or later,
4188 // these workarounds are no longer required
4189
4190 /* These two ForceNormal's are work-arounds for GMP bugs
37464191 in GMP 4.3.0 */
37474192 ForceNormal(d);
37484193 ForceNormal(xa);
37874232 }
37884233
37894234 /* end normalize */
4235 #endif
37904236
37914237
37924238 if (aneg) _ntl_gnegate(&xa);
38584304 SIZE(d) = sd;
38594305 SIZE(u) = su;
38604306
4307 #if 0
4308 // since we're now requiring GMP version 5.0.0 or later,
4309 // these workarounds are no longer required
4310
38614311 /* These two ForceNormal's are work-arounds for GMP bugs
38624312 in GMP 4.3.0 */
38634313 ForceNormal(d);
38644314 ForceNormal(u);
4315 #endif
38654316
38664317 if (ONEP(d)) {
38674318
38704321 * GMP is sloppy.
38714322 */
38724323
4324 #if 0
4325 // since we're now requiring GMP version 5.0.0 or later,
4326 // these workarounds are no longer required
38734327
38744328 if (_ntl_gsign(u) < 0) {
38754329 _ntl_gadd(u, nin, &u);
38834337 _ntl_gmod(u, nin, &u);
38844338 }
38854339 }
4340 #else
4341 if (_ntl_gsign(u) < 0) {
4342 _ntl_gadd(u, nin, &u);
4343 }
4344
4345 #endif
38864346
38874347 _ntl_gcopy(u, invv);
38884348 return 0;
63016761 case 3: ll_mul_add(acc, row[3-1], b[3-1]);
63026762 case 2: ll_mul_add(acc, row[2-1], b[2-1]);
63036763 }
6764 #elif (CRT_ALTCODE_UNROLL)
6765 long j = n;
6766 for (; j > 16; j -= 16) {
6767 ll_mul_add(acc, row[j-1], b[j-1]);
6768 ll_mul_add(acc, row[j-2], b[j-2]);
6769 ll_mul_add(acc, row[j-3], b[j-3]);
6770 ll_mul_add(acc, row[j-4], b[j-4]);
6771 ll_mul_add(acc, row[j-5], b[j-5]);
6772 ll_mul_add(acc, row[j-6], b[j-6]);
6773 ll_mul_add(acc, row[j-7], b[j-7]);
6774 ll_mul_add(acc, row[j-8], b[j-8]);
6775 ll_mul_add(acc, row[j-9], b[j-9]);
6776 ll_mul_add(acc, row[j-10], b[j-10]);
6777 ll_mul_add(acc, row[j-11], b[j-11]);
6778 ll_mul_add(acc, row[j-12], b[j-12]);
6779 ll_mul_add(acc, row[j-13], b[j-13]);
6780 ll_mul_add(acc, row[j-14], b[j-14]);
6781 ll_mul_add(acc, row[j-15], b[j-15]);
6782 ll_mul_add(acc, row[j-16], b[j-16]);
6783 }
6784 switch (j) {
6785 case 16: ll_mul_add(acc, row[16-1], b[16-1]);
6786 case 15: ll_mul_add(acc, row[15-1], b[15-1]);
6787 case 14: ll_mul_add(acc, row[14-1], b[14-1]);
6788 case 13: ll_mul_add(acc, row[13-1], b[13-1]);
6789 case 12: ll_mul_add(acc, row[12-1], b[12-1]);
6790 case 11: ll_mul_add(acc, row[11-1], b[11-1]);
6791 case 10: ll_mul_add(acc, row[10-1], b[10-1]);
6792 case 9: ll_mul_add(acc, row[9-1], b[9-1]);
6793 case 8: ll_mul_add(acc, row[8-1], b[8-1]);
6794 case 7: ll_mul_add(acc, row[7-1], b[7-1]);
6795 case 6: ll_mul_add(acc, row[6-1], b[6-1]);
6796 case 5: ll_mul_add(acc, row[5-1], b[5-1]);
6797 case 4: ll_mul_add(acc, row[4-1], b[4-1]);
6798 case 3: ll_mul_add(acc, row[3-1], b[3-1]);
6799 case 2: ll_mul_add(acc, row[2-1], b[2-1]);
6800 }
6801
63046802 #else
63056803 for (j = 1; j < n; j++)
63066804 ll_mul_add(acc, row[j], b[j]);
71787676 case 2: ll_mul_add(acc, adata[2-1], tp[2-1]);
71797677 }
71807678
7679 #elif (TBL_UNROLL)
7680 long j = sa;
7681 for (; j > 16; j -= 16) {
7682 ll_mul_add(acc, adata[j-1], tp[j-1]);
7683 ll_mul_add(acc, adata[j-2], tp[j-2]);
7684 ll_mul_add(acc, adata[j-3], tp[j-3]);
7685 ll_mul_add(acc, adata[j-4], tp[j-4]);
7686 ll_mul_add(acc, adata[j-5], tp[j-5]);
7687 ll_mul_add(acc, adata[j-6], tp[j-6]);
7688 ll_mul_add(acc, adata[j-7], tp[j-7]);
7689 ll_mul_add(acc, adata[j-8], tp[j-8]);
7690 ll_mul_add(acc, adata[j-9], tp[j-9]);
7691 ll_mul_add(acc, adata[j-10], tp[j-10]);
7692 ll_mul_add(acc, adata[j-11], tp[j-11]);
7693 ll_mul_add(acc, adata[j-12], tp[j-12]);
7694 ll_mul_add(acc, adata[j-13], tp[j-13]);
7695 ll_mul_add(acc, adata[j-14], tp[j-14]);
7696 ll_mul_add(acc, adata[j-15], tp[j-15]);
7697 ll_mul_add(acc, adata[j-16], tp[j-16]);
7698 }
7699 switch (j) {
7700 case 16: ll_mul_add(acc, adata[16-1], tp[16-1]);
7701 case 15: ll_mul_add(acc, adata[15-1], tp[15-1]);
7702 case 14: ll_mul_add(acc, adata[14-1], tp[14-1]);
7703 case 13: ll_mul_add(acc, adata[13-1], tp[13-1]);
7704 case 12: ll_mul_add(acc, adata[12-1], tp[12-1]);
7705 case 11: ll_mul_add(acc, adata[11-1], tp[11-1]);
7706 case 10: ll_mul_add(acc, adata[10-1], tp[10-1]);
7707 case 9: ll_mul_add(acc, adata[9-1], tp[9-1]);
7708 case 8: ll_mul_add(acc, adata[8-1], tp[8-1]);
7709 case 7: ll_mul_add(acc, adata[7-1], tp[7-1]);
7710 case 6: ll_mul_add(acc, adata[6-1], tp[6-1]);
7711 case 5: ll_mul_add(acc, adata[5-1], tp[5-1]);
7712 case 4: ll_mul_add(acc, adata[4-1], tp[4-1]);
7713 case 3: ll_mul_add(acc, adata[3-1], tp[3-1]);
7714 case 2: ll_mul_add(acc, adata[2-1], tp[2-1]);
7715 }
7716
71817717 #else
71827718 long j;
71837719 for (j = 1; j < sa; j++)
84608996 }
84618997
84628998
8999 #ifdef NTL_PROVIDES_SS_LIP_IMPL
9000
9001 void
9002 _ntl_leftrotate(_ntl_gbigint *a, const _ntl_gbigint *b, long e,
9003 _ntl_gbigint p, long n, _ntl_gbigint *scratch)
9004 {
9005 if (e == 0 || ZEROP(*b)) {
9006 _ntl_gcopy(*b, a);
9007 return;
9008 }
9009
9010 long sb, nwords;
9011
9012 if (a == b || ((unsigned long) n) % NTL_ZZ_NBITS != 0 ||
9013 (sb = SIZE(*b)) == 1 + (nwords = ((unsigned long) n) / NTL_ZZ_NBITS)) {
9014
9015 _ntl_grshift(*b, n-e, scratch);
9016 _ntl_glowbits(*b, n-e, a);
9017 _ntl_glshift(*a, e, a);
9018
9019 if (_ntl_gcompare(*a, *scratch) < 0) {
9020 _ntl_gswitchbit(a, n);
9021 _ntl_gsadd(*a, 1, a);
9022 _ntl_gsubpos(*a, *scratch, a);
9023 }
9024 else {
9025 _ntl_gsubpos(*a, *scratch, a);
9026 }
9027
9028 return;
9029 }
9030
9031 long ewords = ((unsigned long) e) / NTL_ZZ_NBITS;
9032 long ebits = ((unsigned long) e) % NTL_ZZ_NBITS;
9033
9034 if (MustAlloc(*a, nwords+1)) _ntl_gsetlength(a, nwords+1);
9035
9036 _ntl_limb_t *adata = DATA(*a);
9037 _ntl_limb_t *bdata = DATA(*b);
9038
9039
9040 long special_carry = 0;
9041 long sa = 0;
9042
9043 if (ewords) {
9044 long hiwords = sb - (nwords-ewords);
9045 if (hiwords > 0) {
9046
9047 _ntl_limb_t borrow = NTL_MPN(neg)(adata, bdata + (nwords-ewords),
9048 hiwords);
9049 if (hiwords < ewords) {
9050 if (borrow) {
9051 for (long i = hiwords; i < ewords; i++)
9052 adata[i] = _ntl_limb_t(-1);
9053 }
9054 else {
9055 for (long i = hiwords; i < ewords; i++)
9056 adata[i] = 0;
9057 }
9058 }
9059
9060 if (borrow) {
9061 borrow = NTL_MPN(sub_1)(adata + ewords, bdata, nwords-ewords, 1);
9062 if (borrow) {
9063 special_carry = NTL_MPN(add_1)(adata, adata, nwords, 1);
9064 // special case: result so far is 2^n
9065 }
9066 }
9067 else {
9068 for (long i = 0; i < nwords-ewords; i++) adata[i+ewords] = bdata[i];
9069 }
9070
9071 sa = nwords;
9072 }
9073 else {
9074 for (long i = 0; i < ewords; i++) adata[i] = 0;
9075 for (long i = 0; i < sb; i++) adata[i+ewords] = bdata[i];
9076
9077 sa = ewords + sb;
9078 }
9079 }
9080 else {
9081 for (long i = 0; i < sb; i++) adata[i] = bdata[i];
9082 sa = sb;
9083 }
9084
9085 long here = 0;
9086
9087 if (ebits) {
9088 if (special_carry) {
9089 NTL_MPN(sub_1)(adata, adata, nwords, (1L << ebits) - 1L);
9090 }
9091 else if (sa == nwords) {
9092 _ntl_limb_t shout = NTL_MPN(lshift)(adata, adata, sa, ebits);
9093 if (shout) {
9094 _ntl_limb_t borrow = NTL_MPN(sub_1)(adata, adata, sa, shout);
9095 if (borrow) {
9096 _ntl_limb_t carry = NTL_MPN(add_1)(adata, adata, sa, 1);
9097 if (carry) {
9098 adata[sa] = 1;
9099 sa++;
9100 }
9101 }
9102 }
9103 }
9104 else { // sa < nwords
9105 _ntl_limb_t shout = NTL_MPN(lshift)(adata, adata, sa, ebits);
9106 if (shout) {
9107 adata[sa] = shout;
9108 sa++;
9109 }
9110 }
9111 }
9112 else {
9113 if (special_carry) {
9114 adata[sa] = 1;
9115 sa++;
9116 }
9117 }
9118
9119 STRIP(sa, adata);
9120 SIZE(*a) = sa;
9121
9122 }
9123
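_ntl_leftrotate above multiplies b by 2^e in the ring used by this Schönhage–Strassen-style code, which appears to be arithmetic modulo 2^n + 1; since 2^n = -1 (mod 2^n + 1), the bits shifted past position n wrap around with a negated sign. A single-word sketch of that identity (hypothetical, not NTL code; assumes 0 <= x <= 2^n, 1 <= e <= n-1, and n <= 62):

    #include <cstdint>

    // Compute (x * 2^e) mod (2^n + 1) by splitting x = hi*2^(n-e) + lo,
    // so that x*2^e = hi*2^n + lo*2^e = lo*2^e - hi (mod 2^n + 1).
    uint64_t leftrotate_fermat(uint64_t x, unsigned e, unsigned n)
    {
        uint64_t p  = (1ULL << n) + 1;
        uint64_t lo = x & ((1ULL << (n - e)) - 1);
        uint64_t hi = x >> (n - e);
        uint64_t t  = lo << e;                 // < 2^n, already reduced
        return (t >= hi) ? t - hi : t + p - hi;
    }
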
9124 void
9125 _ntl_ss_addmod(_ntl_gbigint *x, const _ntl_gbigint *a,
9126 const _ntl_gbigint *b, _ntl_gbigint p, long n)
9127 {
9128 if (((unsigned long) n) % NTL_ZZ_NBITS != 0) {
9129 _ntl_gadd(*a, *b, x);
9130 if (_ntl_gcompare(*x, p) >= 0) {
9131 _ntl_gsadd(*x, -1, x);
9132 _ntl_gswitchbit(x, n);
9133 }
9134 }
9135 else {
9136 _ntl_gadd(*a, *b, x);
9137 long sx, nwords;
9138 if (!*x ||
9139 (sx = SIZE(*x)) <= (nwords = ((unsigned long) n) / NTL_ZZ_NBITS))
9140 return;
9141
9142 _ntl_limb_t *xdata = DATA(*x);
9143 if (xdata[nwords] == 2) {
9144 for (long i = 0; i < nwords; i++) xdata[i] = _ntl_limb_t(-1);
9145 SIZE(*x) = nwords;
9146 return;
9147 }
9148
9149 long i = nwords-1;
9150 while (i >= 0 && xdata[i] == 0) i--;
9151 if (i < 0) return;
9152
9153 NTL_MPN(sub_1)(xdata, xdata, nwords, 1);
9154 sx = nwords;
9155 STRIP(sx, xdata);
9156 SIZE(*x) = sx;
9157 }
9158 }
9159
9160
9161 void
9162 _ntl_ss_submod(_ntl_gbigint *x, const _ntl_gbigint *a,
9163 const _ntl_gbigint *b, _ntl_gbigint p, long n)
9164 {
9165 if (((unsigned long) n) % NTL_ZZ_NBITS != 0) {
9166 if (_ntl_gcompare(*a, *b) < 0) {
9167 _ntl_gadd(*a, p, x);
9168 _ntl_gsubpos(*x, *b, x);
9169 }
9170 else {
9171 _ntl_gsubpos(*a, *b, x);
9172 }
9173 }
9174 else {
9175 if (ZEROP(*b)) {
9176 _ntl_gcopy(*a, x);
9177 return;
9178 }
9179
9180 long sb = SIZE(*b);
9181 _ntl_limb_t *bdata = DATA(*b);
9182
9183 long sa;
9184
9185 if (!*a)
9186 sa = 0;
9187 else
9188 sa = SIZE(*a);
9189
9190 long nwords = ((unsigned long) n) / NTL_ZZ_NBITS;
9191 if (MustAlloc(*x, nwords+1)) _ntl_gsetlength(x, nwords+1);
9192 _ntl_limb_t *xdata = DATA(*x);
9193
9194 if (sa >= sb) {
9195 _ntl_limb_t *adata = DATA(*a);
9196 _ntl_limb_t borrow = NTL_MPN(sub)(xdata, adata, sa, bdata, sb);
9197 if (borrow) {
9198 for (long i = sa; i < nwords; i++) xdata[i] = _ntl_limb_t(-1);
9199 _ntl_limb_t carry = NTL_MPN(add_1)(xdata, xdata, nwords, 1);
9200 if (carry) {
9201 xdata[nwords] = 1;
9202 SIZE(*x) = nwords+1;
9203 }
9204 else {
9205 long sx = nwords;
9206 STRIP(sx, xdata);
9207 SIZE(*x) = sx;
9208 }
9209 }
9210 else {
9211 long sx = sa;
9212 STRIP(sx, xdata);
9213 SIZE(*x) = sx;
9214 }
9215 }
9216 else {
9217 if (sa == 0) {
9218 xdata[0] = 1;
9219 }
9220 else {
9221 _ntl_limb_t *adata = DATA(*a);
9222 xdata[sa] = NTL_MPN(add_1)(xdata, adata, sa, 1);
9223 }
9224 for (long i = sa+1; i <= nwords; i++) xdata[i] = 0;
9225 xdata[nwords]++;
9226 _ntl_limb_t borrow = NTL_MPN(sub_n)(xdata, xdata, bdata, sb);
9227 if (borrow) {
9228 NTL_MPN(sub_1)(xdata+sb, xdata+sb, nwords+1-sb, 1);
9229 }
9230 long sx = nwords+1;
9231 STRIP(sx, xdata);
9232 SIZE(*x) = sx;
9233 }
9234 }
9235 }
9236
9237 #endif
9238
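The companion routines _ntl_ss_addmod and _ntl_ss_submod keep residues in [0, 2^n] and reduce with at most one correction step. A minimal single-word version of the addition case (hypothetical, not NTL code; assumes n <= 62 so the modulus and the sum fit in a uint64_t):

    #include <cstdint>

    // Add a and b modulo p = 2^n + 1, with 0 <= a, b <= 2^n.
    uint64_t addmod_fermat(uint64_t a, uint64_t b, unsigned n)
    {
        uint64_t p = (1ULL << n) + 1;
        uint64_t s = a + b;                    // no overflow for n <= 62
        return (s >= p) ? s - p : s;
    }
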
9239
9240
118118 p_info_owner.make();
119119 p_info = p_info_owner.get();
120120
121 bool bigtab = false;
121 long bigtab_index = -1;
122122 #ifdef NTL_FFT_BIGTAB
123 bigtab = true;
123 bigtab_index = 0;
124124 #endif
125 InitFFTPrimeInfo(*p_info, q, w, bigtab);
125 InitFFTPrimeInfo(*p_info, q, w, bigtab_index);
126126
127127 NumPrimes = 1;
128128 PrimeCnt = 0;
18911891 }
18921892 }
18931893
1894 void GCD(zz_pEX& x, const zz_pEX& a, const zz_pEX& b)
1894 void PlainGCD(zz_pEX& x, const zz_pEX& a, const zz_pEX& b)
18951895 {
18961896 zz_pE t;
18971897
19271927 }
19281928
19291929
1930
1931
1930 class _NTL_zz_pEXMatrix {
1931 private:
1932
1933 _NTL_zz_pEXMatrix(const _NTL_zz_pEXMatrix&); // disable
1934 zz_pEX elts[2][2];
1935
1936 public:
1937
1938 _NTL_zz_pEXMatrix() { }
1939 ~_NTL_zz_pEXMatrix() { }
1940
1941 void operator=(const _NTL_zz_pEXMatrix&);
1942 zz_pEX& operator() (long i, long j) { return elts[i][j]; }
1943 const zz_pEX& operator() (long i, long j) const { return elts[i][j]; }
1944 };
1945
1946
1947 void _NTL_zz_pEXMatrix::operator=(const _NTL_zz_pEXMatrix& M)
1948 {
1949 elts[0][0] = M.elts[0][0];
1950 elts[0][1] = M.elts[0][1];
1951 elts[1][0] = M.elts[1][0];
1952 elts[1][1] = M.elts[1][1];
1953 }
1954
1955
1956 static
1957 void mul(zz_pEX& U, zz_pEX& V, const _NTL_zz_pEXMatrix& M)
1958 // (U, V)^T = M*(U, V)^T
1959 {
1960 zz_pEX t1, t2, t3;
1961
1962 mul(t1, M(0,0), U);
1963 mul(t2, M(0,1), V);
1964 add(t3, t1, t2);
1965 mul(t1, M(1,0), U);
1966 mul(t2, M(1,1), V);
1967 add(V, t1, t2);
1968 U = t3;
1969 }
1970
1971
1972 static
1973 void mul(_NTL_zz_pEXMatrix& A, _NTL_zz_pEXMatrix& B, _NTL_zz_pEXMatrix& C)
1974 // A = B*C, B and C are destroyed
1975 {
1976 zz_pEX t1, t2;
1977
1978 mul(t1, B(0,0), C(0,0));
1979 mul(t2, B(0,1), C(1,0));
1980 add(A(0,0), t1, t2);
1981
1982 mul(t1, B(1,0), C(0,0));
1983 mul(t2, B(1,1), C(1,0));
1984 add(A(1,0), t1, t2);
1985
1986 mul(t1, B(0,0), C(0,1));
1987 mul(t2, B(0,1), C(1,1));
1988 add(A(0,1), t1, t2);
1989
1990 mul(t1, B(1,0), C(0,1));
1991 mul(t2, B(1,1), C(1,1));
1992 add(A(1,1), t1, t2);
1993
1994 long i, j;
1995 for (i = 0; i < 2; i++) {
1996 for (j = 0; j < 2; j++) {
1997 B(i,j).kill();
1998 C(i,j).kill();
1999 }
2000 }
2001 }
2002
2003
2004 void IterHalfGCD(_NTL_zz_pEXMatrix& M_out, zz_pEX& U, zz_pEX& V, long d_red)
2005 {
2006 M_out(0,0).SetMaxLength(d_red);
2007 M_out(0,1).SetMaxLength(d_red);
2008 M_out(1,0).SetMaxLength(d_red);
2009 M_out(1,1).SetMaxLength(d_red);
2010
2011 set(M_out(0,0)); clear(M_out(0,1));
2012 clear(M_out(1,0)); set(M_out(1,1));
2013
2014 long goal = deg(U) - d_red;
2015
2016 if (deg(V) <= goal)
2017 return;
2018
2019 zz_pEX Q, t(INIT_SIZE, d_red);
2020
2021 while (deg(V) > goal) {
2022 PlainDivRem(Q, U, U, V);
2023 swap(U, V);
2024
2025 mul(t, Q, M_out(1,0));
2026 sub(t, M_out(0,0), t);
2027 M_out(0,0) = M_out(1,0);
2028 M_out(1,0) = t;
2029
2030 mul(t, Q, M_out(1,1));
2031 sub(t, M_out(0,1), t);
2032 M_out(0,1) = M_out(1,1);
2033 M_out(1,1) = t;
2034 }
2035 }
2036
2037
2038 #define NTL_zz_pEX_HalfGCD_CROSSOVER (25)
2039 #define NTL_zz_pEX_GCD_CROSSOVER (275)
2040
2041
2042 void HalfGCD(_NTL_zz_pEXMatrix& M_out, const zz_pEX& U, const zz_pEX& V, long d_red)
2043 {
2044 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2045 set(M_out(0,0)); clear(M_out(0,1));
2046 clear(M_out(1,0)); set(M_out(1,1));
2047
2048 return;
2049 }
2050
2051
2052 long n = deg(U) - 2*d_red + 2;
2053 if (n < 0) n = 0;
2054
2055 zz_pEX U1, V1;
2056
2057 RightShift(U1, U, n);
2058 RightShift(V1, V, n);
2059
2060 if (d_red <= NTL_zz_pEX_HalfGCD_CROSSOVER) {
2061 IterHalfGCD(M_out, U1, V1, d_red);
2062 return;
2063 }
2064
2065 long d1 = (d_red + 1)/2;
2066 if (d1 < 1) d1 = 1;
2067 if (d1 >= d_red) d1 = d_red - 1;
2068
2069 _NTL_zz_pEXMatrix M1;
2070
2071 HalfGCD(M1, U1, V1, d1);
2072 mul(U1, V1, M1);
2073
2074 long d2 = deg(V1) - deg(U) + n + d_red;
2075
2076 if (IsZero(V1) || d2 <= 0) {
2077 M_out = M1;
2078 return;
2079 }
2080
2081
2082 zz_pEX Q;
2083 _NTL_zz_pEXMatrix M2;
2084
2085 DivRem(Q, U1, U1, V1);
2086 swap(U1, V1);
2087
2088 HalfGCD(M2, U1, V1, d2);
2089
2090 zz_pEX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
2091
2092 mul(t, Q, M1(1,0));
2093 sub(t, M1(0,0), t);
2094 swap(M1(0,0), M1(1,0));
2095 swap(M1(1,0), t);
2096
2097 t.kill();
2098
2099 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
2100
2101 mul(t, Q, M1(1,1));
2102 sub(t, M1(0,1), t);
2103 swap(M1(0,1), M1(1,1));
2104 swap(M1(1,1), t);
2105
2106 t.kill();
2107
2108 mul(M_out, M2, M1);
2109 }
2110
2111
2112
2113
2114 void XHalfGCD(_NTL_zz_pEXMatrix& M_out, zz_pEX& U, zz_pEX& V, long d_red)
2115 {
2116 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2117 set(M_out(0,0)); clear(M_out(0,1));
2118 clear(M_out(1,0)); set(M_out(1,1));
2119
2120 return;
2121 }
2122
2123 long du = deg(U);
2124
2125 if (d_red <= NTL_zz_pEX_HalfGCD_CROSSOVER) {
2126 IterHalfGCD(M_out, U, V, d_red);
2127 return;
2128 }
2129
2130 long d1 = (d_red + 1)/2;
2131 if (d1 < 1) d1 = 1;
2132 if (d1 >= d_red) d1 = d_red - 1;
2133
2134 _NTL_zz_pEXMatrix M1;
2135
2136 HalfGCD(M1, U, V, d1);
2137 mul(U, V, M1);
2138
2139 long d2 = deg(V) - du + d_red;
2140
2141 if (IsZero(V) || d2 <= 0) {
2142 M_out = M1;
2143 return;
2144 }
2145
2146
2147 zz_pEX Q;
2148 _NTL_zz_pEXMatrix M2;
2149
2150 DivRem(Q, U, U, V);
2151 swap(U, V);
2152
2153 XHalfGCD(M2, U, V, d2);
2154
2155 zz_pEX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
2156
2157 mul(t, Q, M1(1,0));
2158 sub(t, M1(0,0), t);
2159 swap(M1(0,0), M1(1,0));
2160 swap(M1(1,0), t);
2161
2162 t.kill();
2163
2164 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
2165
2166 mul(t, Q, M1(1,1));
2167 sub(t, M1(0,1), t);
2168 swap(M1(0,1), M1(1,1));
2169 swap(M1(1,1), t);
2170
2171 t.kill();
2172
2173 mul(M_out, M2, M1);
2174 }
2175
2176 void HalfGCD(zz_pEX& U, zz_pEX& V)
2177 {
2178 long d_red = (deg(U)+1)/2;
2179
2180 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2181 return;
2182 }
2183
2184 long du = deg(U);
2185
2186
2187 long d1 = (d_red + 1)/2;
2188 if (d1 < 1) d1 = 1;
2189 if (d1 >= d_red) d1 = d_red - 1;
2190
2191 _NTL_zz_pEXMatrix M1;
2192
2193 HalfGCD(M1, U, V, d1);
2194 mul(U, V, M1);
2195
2196 long d2 = deg(V) - du + d_red;
2197
2198 if (IsZero(V) || d2 <= 0) {
2199 return;
2200 }
2201
2202 M1(0,0).kill();
2203 M1(0,1).kill();
2204 M1(1,0).kill();
2205 M1(1,1).kill();
2206
2207
2208 zz_pEX Q;
2209
2210 DivRem(Q, U, U, V);
2211 swap(U, V);
2212
2213 HalfGCD(M1, U, V, d2);
2214
2215 mul(U, V, M1);
2216 }
2217
2218
2219 void GCD(zz_pEX& d, const zz_pEX& u, const zz_pEX& v)
2220 {
2221 zz_pEX u1, v1;
2222
2223 u1 = u;
2224 v1 = v;
2225
2226 if (deg(u1) == deg(v1)) {
2227 if (IsZero(u1)) {
2228 clear(d);
2229 return;
2230 }
2231
2232 rem(v1, v1, u1);
2233 }
2234 else if (deg(u1) < deg(v1)) {
2235 swap(u1, v1);
2236 }
2237
2238 // deg(u1) > deg(v1)
2239
2240 while (deg(u1) > NTL_zz_pEX_GCD_CROSSOVER && !IsZero(v1)) {
2241 HalfGCD(u1, v1);
2242
2243 if (!IsZero(v1)) {
2244 rem(u1, u1, v1);
2245 swap(u1, v1);
2246 }
2247 }
2248
2249 PlainGCD(d, u1, v1);
2250 }
2251
19322252
19332253 void XGCD(zz_pEX& d, zz_pEX& s, zz_pEX& t, const zz_pEX& a, const zz_pEX& b)
19342254 {
1935 zz_pE z;
1936
1937
1938 if (IsZero(b)) {
2255 zz_pE w;
2256
2257 if (IsZero(a) && IsZero(b)) {
2258 clear(d);
19392259 set(s);
19402260 clear(t);
1941 d = a;
1942 }
1943 else if (IsZero(a)) {
1944 clear(s);
1945 set(t);
1946 d = b;
1947 }
1948 else {
1949 long e = max(deg(a), deg(b)) + 1;
1950
1951 zz_pEX temp(INIT_SIZE, e), u(INIT_SIZE, e), v(INIT_SIZE, e),
1952 u0(INIT_SIZE, e), v0(INIT_SIZE, e),
1953 u1(INIT_SIZE, e), v1(INIT_SIZE, e),
1954 u2(INIT_SIZE, e), v2(INIT_SIZE, e), q(INIT_SIZE, e);
1955
1956
1957 set(u1); clear(v1);
1958 clear(u2); set(v2);
1959 u = a; v = b;
1960
1961 do {
1962 DivRem(q, u, u, v);
1963 swap(u, v);
1964 u0 = u2;
1965 v0 = v2;
1966 mul(temp, q, u2);
1967 sub(u2, u1, temp);
1968 mul(temp, q, v2);
1969 sub(v2, v1, temp);
1970 u1 = u0;
1971 v1 = v0;
1972 } while (!IsZero(v));
1973
1974 d = u;
1975 s = u1;
1976 t = v1;
1977 }
1978
1979 if (IsZero(d)) return;
1980 if (IsOne(LeadCoeff(d))) return;
1981
1982 /* make gcd monic */
1983
1984 inv(z, LeadCoeff(d));
1985 mul(d, d, z);
1986 mul(s, s, z);
1987 mul(t, t, z);
2261 return;
2262 }
2263
2264 zz_pEX U, V, Q;
2265
2266 U = a;
2267 V = b;
2268
2269 long flag = 0;
2270
2271 if (deg(U) == deg(V)) {
2272 DivRem(Q, U, U, V);
2273 swap(U, V);
2274 flag = 1;
2275 }
2276 else if (deg(U) < deg(V)) {
2277 swap(U, V);
2278 flag = 2;
2279 }
2280
2281 _NTL_zz_pEXMatrix M;
2282
2283 XHalfGCD(M, U, V, deg(U)+1);
2284
2285 d = U;
2286
2287 if (flag == 0) {
2288 s = M(0,0);
2289 t = M(0,1);
2290 }
2291 else if (flag == 1) {
2292 s = M(0,1);
2293 mul(t, Q, M(0,1));
2294 sub(t, M(0,0), t);
2295 }
2296 else { /* flag == 2 */
2297 s = M(0,1);
2298 t = M(0,0);
2299 }
2300
2301 // normalize
2302
2303 inv(w, LeadCoeff(d));
2304 mul(d, d, w);
2305 mul(s, s, w);
2306 mul(t, t, w);
19882307 }
19892308
19902309
0 #include <NTL/lzz_pXFactoring.h>
1 #include <NTL/lzz_pEX.h>
2
3 NTL_CLIENT
4
5
6
7 void test(zz_pX& P, zz_pEX& f, zz_pEX& g, zz_pEX& h, zz_pEX& hx, zz_pEX& s, zz_pEX& t)
8 {
9 /* P is the polynomial of the extension
10 * f and g the polynomials
11 * h the gcd
12 * hx the gcd obtained using XGCD
13 * s, t are Bezout coefficients hx=f*s+g*t
14 */
15 zz_pEX htest,rf,rg;
16
17 if (h!=hx){
18 cout << P << "\n" << f << "\n" << g << "\n";
19 Error("different gcd:\n");
20 }
21
22 if (max(deg(f), deg(g)) > 0 || min(deg(f), deg(g)) >= 0) {
23 if (deg(s) >= deg(g) || deg(t) >= deg(f)) {
24 cout << P << "\n" << f << "\n" << g << "\n";
25 Error("degree bounds at fault:\n");
26 }
27 }
28
29
30 mul(s,s,f);
31 mul(t,t,g);
32 add(htest,t,s);
33 if (h!=htest){
34 cout << P << "\n" << f << "\n" << g << "\n";
35 Error("xgcd at fault:\n");
36 }
37 if (!IsZero(h)){
38 rem(rf,f,h);
39 rem(rg,g,h);
40 if ((!IsZero(rf))||(!IsZero(rg))){
41 cout << P << "\n" << f << "\n" << g << "\n";
42 Error("not a common divisor\n");
43 }
44 }else{
45 if (!IsZero(f) && !IsZero(g)){
46 cout << "debug:\n";
47 cout << P << "\n" << f << "\n" << g << "\n" << h << "\n";
48 Error("ooops:\n");
49 }
50 }
51 }
52
53
54 int main()
55 {
56
57 long prime = 17;
58
59 zz_p::init(prime);
60
61 zz_pX P;
62
63 BuildIrred(P, 5);
64
65 zz_pE::init(P);
66
67 for (long i = 0; i < 400; i++) {
68 if (i%10 == 0) cerr << ".";
69 zz_pEX f,g,h,s,t,hx;
70
71 long deg_h;
72 if (RandomBnd(2))
73 deg_h = RandomBnd(10)+1;
74 else
75 deg_h = RandomBnd(500)+1;
76
77 random(h, deg_h);
78 SetCoeff(h, deg_h);
79
80 long deg_f;
81 if (RandomBnd(2))
82 deg_f = RandomBnd(10)+1;
83 else
84 deg_f = RandomBnd(1000)+1;
85
86 random(f, deg_f);
87 f *= h;
88
89 long deg_g;
90 if (RandomBnd(2))
91 deg_g = RandomBnd(10)+1;
92 else
93 deg_g = RandomBnd(1000)+1;
94
95 random(g, deg_g);
96 g *= h;
97
98 h = 0;
99
100 GCD(h, f, g);
101 XGCD(hx, s, t, f, g);
102 test(P, f, g, h, hx, s, t);
103 }
104
105 cerr << "\n";
106
107 }
00
11 #include <NTL/lzz_pX.h>
2 #include <NTL/FFT_impl.h>
23
34
45 NTL_START_IMPL
13971398
13981399 if (R.k < 0) {
13991400 k = -1;
1401 len = 0;
14001402 return *this;
14011403 }
14021404
14031405 DoSetSize(R.k, R.NumPrimes);
1404 long i, j, n;
1405
1406 n = 1L << k;
1406 len = R.len;
1407
1408 long i, j;
14071409
14081410 for (i = 0; i < NumPrimes; i++)
1409 for (j = 0; j < n; j++)
1411 for (j = 0; j < len; j++)
14101412 tbl[i][j] = R.tbl[i][j];
14111413
14121414 return *this;
16151617
16161618
16171619
1618 void TofftRep(fftRep& y, const zz_pX& x, long k, long lo, long hi)
1620 void TofftRep_trunc(fftRep& y, const zz_pX& x, long k,
1621 long len, long lo, long hi)
16191622 // computes an n = 2^k point convolution.
16201623 // if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
16211624 {
16361639 hi = min(hi, deg(x));
16371640
16381641 y.SetSize(k);
1639
16401642 n = 1L << k;
16411643
1644 y.len = len = FFTRoundUp(len, k);
1645
16421646 m = max(hi-lo + 1, 0);
1647 long ilen = FFTRoundUp(m, k);
16431648
16441649 const zz_p *xx = x.rep.elts();
16451650
16511656 for (j = 0; j < m; j++) {
16521657 yp[j] = rep(xx[j+lo]);
16531658 }
1654 for (j = m; j < n; j++) {
1659 for (j = m; j < ilen; j++) {
16551660 yp[j] = 0;
16561661 }
16571662 }
16741679 t = sp_CorrectExcess(t, q);
16751680 yp[j] = t;
16761681 }
1677 for (j = m; j < n; j++) {
1682 for (j = m; j < ilen; j++) {
16781683 yp[j] = 0;
16791684 }
16801685 }
16971702
16981703 if (p_info) {
16991704 long *yp = &y.tbl[0][0];
1700 FFTFwd(yp, yp, k, *p_info);
1705 FFTFwd_trunc(yp, yp, k, *p_info, len, ilen);
17011706 }
17021707 else {
17031708 for (i = 0; i < nprimes; i++) {
17041709 long *yp = &y.tbl[i][0];
1705 FFTFwd(yp, yp, k, i);
1710 FFTFwd_trunc(yp, yp, k, i, len, ilen);
17061711 }
17071712 }
17081713 }
17331738 y.SetSize(k);
17341739
17351740 n = 1L << k;
1741 y.len = n;
17361742
17371743 m = max(hi-lo + 1, 0);
17381744
17801786
17811787 if (p_info) {
17821788 long *yp = &y.tbl[0][0];
1783 FFTRev1(yp, yp, k, *p_info);
1789 FFTRev1_trans(yp, yp, k, *p_info);
17841790 }
17851791 else {
17861792 for (i = 0; i < info->NumPrimes; i++) {
17871793 long *yp = &y.tbl[i][0];
1788 FFTRev1(yp, yp, k, i);
1794 FFTRev1_trans(yp, yp, k, i);
17891795 }
17901796 }
17911797 }
18061812 k = y.k;
18071813 n = (1L << k);
18081814
1815 hi = min(hi, n-1);
1816 l = hi-lo+1;
1817 l = max(l, 0);
1818
1819 long len = y.len;
1820 if (len <= hi) LogicError("FromfftRep: bad len");
1821
18091822 FFTPrimeInfo *p_info = info->p_info;
18101823
18111824 if (p_info) {
18121825 long *yp = &y.tbl[0][0];
1813 FFTRev1(yp, yp, k, *p_info);
1826 FFTRev1_trunc(yp, yp, k, *p_info, len);
18141827 }
18151828 else {
18161829 for (i = 0; i < NumPrimes; i++) {
18171830 long *yp = &y.tbl[i][0];
1818 FFTRev1(yp, yp, k, i);
1819 }
1820 }
1821
1822 hi = min(hi, n-1);
1823 l = hi-lo+1;
1824 l = max(l, 0);
1831 FFTRev1_trunc(yp, yp, k, i, len);
1832 }
1833 }
1834
18251835 x.rep.SetLength(l);
18261836
18271837 if (p_info) {
18541864 k = y.k;
18551865 n = (1L << k);
18561866
1867 if (y.len != n) LogicError("RevFromfftRep: bad len");
1868
18571869 FFTPrimeInfo *p_info = info->p_info;
18581870
18591871 if (p_info) {
18601872 long *yp = &y.tbl[0][0];
1861 FFTFwd(yp, yp, k, *p_info);
1873 FFTFwd_trans(yp, yp, k, *p_info);
18621874 }
18631875 else {
18641876 for (i = 0; i < NumPrimes; i++) {
18651877 long *yp = &y.tbl[i][0];
1866 FFTFwd(yp, yp, k, i);
1878 FFTFwd_trans(yp, yp, k, i);
18671879 }
18681880 }
18691881
18941906 k = y.k;
18951907 n = (1L << k);
18961908
1909 hi = min(hi, n-1);
1910 l = hi-lo+1;
1911 l = max(l, 0);
1912
1913 long len = y.len;
1914 if (len <= hi) LogicError("FromfftRep: bad len");
1915
18971916 z.SetSize(k);
18981917
18991918 FFTPrimeInfo *p_info = info->p_info;
19011920 if (p_info) {
19021921 long *zp = &z.tbl[0][0];
19031922 const long *yp = &y.tbl[0][0];
1904 FFTRev1(zp, yp, k, *p_info);
1923 FFTRev1_trunc(zp, yp, k, *p_info, len);
19051924 }
19061925 else {
19071926 for (i = 0; i < NumPrimes; i++) {
19081927 long *zp = &z.tbl[i][0];
19091928 const long *yp = &y.tbl[i][0];
1910 FFTRev1(zp, yp, k, i);
1911 }
1912 }
1913
1914 hi = min(hi, n-1);
1915 l = hi-lo+1;
1916 l = max(l, 0);
1929 FFTRev1_trunc(zp, yp, k, i, len);
1930 }
1931 }
1932
19171933 x.rep.SetLength(l);
19181934
19191935 if (p_info) {
19501966
19511967 k = y.k;
19521968 n = (1L << k);
1969
1970
1971 //if (y.len <= min(hi, n-1)) LogicError("FromfftRep: bad len");
1972 if (y.len != n) LogicError("FromfftRep: bad len");
19531973
19541974 FFTPrimeInfo *p_info = info->p_info;
19551975
19952015
19962016 z.SetSize(k);
19972017
2018 long len = z.len = min(x.len, y.len);
2019
19982020 FFTPrimeInfo *p_info = info->p_info;
19992021
20002022 if (p_info) {
20052027 mulmod_t qinv = p_info->qinv;
20062028
20072029 if (NormalizedModulus(qinv)) {
2008 for (j = 0; j < n; j++)
2030 for (j = 0; j < len; j++)
20092031 zp[j] = NormalizedMulMod(xp[j], yp[j], q, qinv);
20102032 }
20112033 else {
2012 for (j = 0; j < n; j++)
2034 for (j = 0; j < len; j++)
20132035 zp[j] = MulMod(xp[j], yp[j], q, qinv);
20142036 }
20152037 }
20212043 long q = GetFFTPrime(i);
20222044 mulmod_t qinv = GetFFTPrimeInv(i);
20232045
2024 for (j = 0; j < n; j++)
2046 for (j = 0; j < len; j++)
20252047 zp[j] = NormalizedMulMod(xp[j], yp[j], q, qinv);
20262048 }
20272049 }
20392061 n = 1L << k;
20402062
20412063 z.SetSize(k);
2064
2065 long len = z.len = min(x.len, y.len);
20422066
20432067 FFTPrimeInfo *p_info = info->p_info;
20442068
20482072 const long *yp = &y.tbl[0][0];
20492073 long q = p_info->q;
20502074
2051 for (j = 0; j < n; j++)
2075 for (j = 0; j < len; j++)
20522076 zp[j] = SubMod(xp[j], yp[j], q);
20532077 }
20542078 else {
20582082 const long *yp = &y.tbl[i][0];
20592083 long q = GetFFTPrime(i);
20602084
2061 for (j = 0; j < n; j++)
2085 for (j = 0; j < len; j++)
20622086 zp[j] = SubMod(xp[j], yp[j], q);
20632087 }
20642088 }
20762100 n = 1L << k;
20772101
20782102 z.SetSize(k);
2103
2104 long len = z.len = min(x.len, y.len);
20792105
20802106 FFTPrimeInfo *p_info = info->p_info;
20812107
20852111 const long *yp = &y.tbl[0][0];
20862112 long q = p_info->q;
20872113
2088 for (j = 0; j < n; j++)
2114 for (j = 0; j < len; j++)
20892115 zp[j] = AddMod(xp[j], yp[j], q);
20902116 }
20912117 else {
20952121 const long *yp = &y.tbl[i][0];
20962122 long q = GetFFTPrime(i);
20972123
2098 for (j = 0; j < n; j++)
2124 for (j = 0; j < len; j++)
20992125 zp[j] = AddMod(xp[j], yp[j], q);
21002126 }
21012127 }
21162142 n = 1L << k;
21172143
21182144 if (l < k) LogicError("reduce: bad operands");
2145 if (a.len < n) LogicError("reduce: bad len");
21192146
21202147 x.SetSize(k);
2148 x.len = n;
2149
2150 if (&x == &a) return;
21212151
21222152 for (i = 0; i < info->NumPrimes; i++) {
21232153 ap = &a.tbl[i][0];
21242154 xp = &x.tbl[i][0];
21252155 for (j = 0; j < n; j++)
2126 xp[j] = ap[j << (l-k)];
2127 }
2128 }
2156 xp[j] = ap[j];
2157 }
2158 }
2159
21292160
21302161 void AddExpand(fftRep& x, const fftRep& a)
21312162 // x = x + (an "expanded" version of a)
21392170 n = 1L << k;
21402171
21412172 if (l < k) LogicError("AddExpand: bad args");
2173 if (x.len < n) LogicError("AddExpand: bad len");
21422174
21432175 FFTPrimeInfo *p_info = info->p_info;
21442176
21472179 const long *ap = &a.tbl[0][0];
21482180 long *xp = &x.tbl[0][0];
21492181 for (j = 0; j < n; j++) {
2150 long j1 = j << (l-k);
2151 xp[j1] = AddMod(xp[j1], ap[j], q);
2182 xp[j] = AddMod(xp[j], ap[j], q);
21522183 }
21532184 }
21542185 else {
21572188 const long *ap = &a.tbl[i][0];
21582189 long *xp = &x.tbl[i][0];
21592190 for (j = 0; j < n; j++) {
2160 long j1 = j << (l-k);
2161 xp[j1] = AddMod(xp[j1], ap[j], q);
2191 xp[j] = AddMod(xp[j], ap[j], q);
21622192 }
21632193 }
21642194 }
21652195 }
21662196
21672197
2168
21692198 void FFTMul(zz_pX& x, const zz_pX& a, const zz_pX& b)
21702199 {
2171 long k, d;
2172
21732200 if (IsZero(a) || IsZero(b)) {
21742201 clear(x);
21752202 return;
21762203 }
21772204
2178 d = deg(a) + deg(b);
2179 k = NextPowerOfTwo(d+1);
2205 long da = deg(a);
2206 long db = deg(b);
2207 long d = da+db;
2208 long k = NextPowerOfTwo(d+1);
21802209
21812210 fftRep R1(INIT_SIZE, k), R2(INIT_SIZE, k);
21822211
2183 TofftRep(R1, a, k);
2184 TofftRep(R2, b, k);
2212 TofftRep_trunc(R1, a, k, d+1);
2213 TofftRep_trunc(R2, b, k, d+1);
21852214 mul(R1, R1, R2);
21862215 FromfftRep(x, R1, 0, d);
21872216 }
21882217
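// Worked illustration (added for exposition; the degrees below are hypothetical,
// not taken from the library source): for deg(a) = 600 and deg(b) = 900 we get
// d = 1500 and k = NextPowerOfTwo(d+1) = 11, so each fftRep table has 2^11 = 2048
// slots, yet only the first d+1 = 1501 values are needed to recover the product.
// TofftRep_trunc records that bound in the fftRep's len field, which the
// FromfftRep variants above check ("FromfftRep: bad len") before reading results.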
21892218 void FFTSqr(zz_pX& x, const zz_pX& a)
21902219 {
2191 long k, d;
2192
21932220 if (IsZero(a)) {
21942221 clear(x);
21952222 return;
21962223 }
21972224
2198 d = 2*deg(a);
2199 k = NextPowerOfTwo(d+1);
2225 long da = deg(a);
2226 long d = 2*da;
2227 long k = NextPowerOfTwo(d+1);
22002228
22012229 fftRep R1(INIT_SIZE, k);
22022230
2203 TofftRep(R1, a, k);
2231 TofftRep_trunc(R1, a, k, d+1);
22042232 mul(R1, R1, R1);
22052233 FromfftRep(x, R1, 0, d);
22062234 }
22852313 fftRep R1(INIT_SIZE, F.l);
22862314 zz_pX P1(INIT_SIZE, n);
22872315
2288 TofftRep(R1, a, F.l, n, 2*(n-1));
2316 TofftRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
22892317 mul(R1, R1, F.HRep);
22902318 FromfftRep(P1, R1, n-2, 2*n-4);
22912319
23412369 fftRep R1(INIT_SIZE, F.l);
23422370 zz_pX P1(INIT_SIZE, n), qq;
23432371
2344 TofftRep(R1, a, F.l, n, 2*(n-1));
2372 TofftRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
23452373 mul(R1, R1, F.HRep);
23462374 FromfftRep(P1, R1, n-2, 2*n-4);
23472375 qq = P1;
23972425 fftRep R1(INIT_SIZE, F.l);
23982426 zz_pX P1(INIT_SIZE, n);
23992427
2400 TofftRep(R1, a, F.l, n, 2*(n-1));
2428 TofftRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
24012429 mul(R1, R1, F.HRep);
24022430 FromfftRep(x, R1, n-2, 2*n-4);
24032431 }
25972625 fftRep R1(INIT_SIZE, k), R2(INIT_SIZE, F.l);
25982626 zz_pX P1(INIT_SIZE, n);
25992627
2600 TofftRep(R1, a, k);
2601 TofftRep(R2, b, k);
2602
2628 long len;
2629 if (zz_p::IsFFTPrime())
2630 len = n;
2631 else
2632 len = 1L << F.k;
2633
2634 TofftRep_trunc(R1, a, k, max(1L << F.k, d));
2635 TofftRep_trunc(R2, b, k, max(1L << F.k, d));
26032636 mul(R1, R1, R2);
2604
26052637 NDFromfftRep(P1, R1, n, d-1, R2); // save R1 for future use
2606
2607 TofftRep(R2, P1, F.l);
2638
2639 TofftRep_trunc(R2, P1, F.l, 2*n-3);
26082640 mul(R2, R2, F.HRep);
26092641 FromfftRep(P1, R2, n-2, 2*n-4);
26102642
2611 TofftRep(R2, P1, F.k);
2643 TofftRep_trunc(R2, P1, F.k, len);
26122644 mul(R2, R2, F.FRep);
26132645 reduce(R1, R1, F.k);
26142646 sub(R1, R1, R2);
26432675 fftRep R1(INIT_SIZE, k), R2(INIT_SIZE, F.l);
26442676 zz_pX P1(INIT_SIZE, n);
26452677
2646 TofftRep(R1, a, k);
2678 long len;
2679 if (zz_p::IsFFTPrime())
2680 len = n;
2681 else
2682 len = 1L << F.k;
2683
2684 TofftRep_trunc(R1, a, k, max(1L << F.k, d));
26472685 mul(R1, R1, R1);
2648 NDFromfftRep(P1, R1, n, d-1, R2); // save R1 for future use
2649
2650 TofftRep(R2, P1, F.l);
2686 NDFromfftRep(P1, R1, n, d-1, R2); // save R1 for future use
2687
2688 TofftRep_trunc(R2, P1, F.l, 2*n-3);
26512689 mul(R2, R2, F.HRep);
26522690 FromfftRep(P1, R2, n-2, 2*n-4);
26532691
2654 TofftRep(R2, P1, F.k);
2692 TofftRep_trunc(R2, P1, F.k, len);
26552693 mul(R2, R2, F.FRep);
26562694 reduce(R1, R1, F.k);
26572695 sub(R1, R1, R2);
28642902 zz_pX P1(INIT_SIZE, n);
28652903
28662904
2867 TofftRep(R1, b, F.l);
2905 TofftRep_trunc(R1, b, F.l, 2*n-2);
28682906 reduce(x.B2, R1, F.k);
28692907 mul(R1, R1, F.HRep);
28702908 FromfftRep(P1, R1, n-1, 2*n-3);
2909
28712910 TofftRep(x.B1, P1, F.l);
2911 // could be truncated to length max(1L << F.k, 2*n-2), except
2912 // for the usage in UpdateMap, where we would have to investigate
2913 // further
2914
28722915 }
28732916
28742917
28992942 zz_pX P1(INIT_SIZE, n), P2(INIT_SIZE, n);
29002943 fftRep R1(INIT_SIZE, F.l), R2(INIT_SIZE, F.l);
29012944
2902 TofftRep(R1, a, F.l);
2945 long len;
2946 if (zz_p::IsFFTPrime())
2947 len = n;
2948 else
2949 len = 1L << F.k;
2950
2951 TofftRep_trunc(R1, a, F.l, max(1L << F.k, 2*n-2));
29032952 mul(R2, R1, B.B1);
29042953 FromfftRep(P1, R2, n-1, 2*n-3);
29052954
29062955 reduce(R1, R1, F.k);
29072956 mul(R1, R1, B.B2);
2908 TofftRep(R2, P1, F.k);
2957 TofftRep_trunc(R2, P1, F.k, len);
29092958 mul(R2, R2, F.FRep);
29102959 sub(R1, R1, R2);
29112960
0 #include <NTL/lzz_pX.h>
1
2 NTL_CLIENT
3
4 #define ITER (500)
5
6 void multest()
7 {
8 cerr << "mul";
9 for (long iter = 0; iter < ITER; iter++) {
10 if (iter % 100 == 0) cerr << ".";
11
12 long da = RandomBnd(5000) + 100;
13 long db = RandomBnd(5000) + 100;
14
15 zz_pX a, b, c1, c2;
16
17 random(a, da);
18 random(b, db);
19
20 if (deg(a) < 80 || deg(b) < 80) {
21 cerr << "*";
22 continue;
23 }
24
25 FFTMul(c1, a, b);
26 PlainMul(c2, a, b);
27
28 if (c1 != c2) {
29 cerr << "******* oops\n";
30 break;
31 }
32 }
33
34 cerr << "\n";
35 }
36
37
38 void sqrtest()
39 {
40 cerr << "sqr";
41 for (long iter = 0; iter < ITER; iter++) {
42 if (iter % 100 == 0) cerr << ".";
43
44 long da = RandomBnd(5000) + 100;
45 long db = RandomBnd(5000) + 100;
46
47 zz_pX a, b, c1, c2;
48
49 random(a, da);
50
51 if (deg(a) < 80) {
52 cerr << "*";
53 continue;
54 }
55
56 FFTSqr(c1, a);
57 PlainMul(c2, a, a);
58
59 if (c1 != c2) {
60 cerr << "******* oops\n";
61 break;
62 }
63 }
64
65 cerr << "\n";
66 }
67
68
69
70
71 void mulmodtest()
72 {
73 cerr << "mulmod";
74 for (long iter = 0; iter < ITER; iter++) {
75 if (iter % 100 == 0) cerr << ".";
76
77 long n = RandomBnd(5000) + 300;
78 long da = RandomBnd(n)+1;
79 long db = RandomBnd(n)+1;
80
81 if (RandomBnd(2)) { da = n; db = n; }
82
83 zz_pX f;
84 random(f, n);
85 SetCoeff(f, n);
86 zz_pXModulus F(f);
87
88 zz_pX a, b, c1, c2;
89 random(a, da);
90 random(b, db);
91
92 MulMod(c1, a, b, F);
93 PlainMul(c2, a, b);
94 rem(c2, c2, f);
95
96 if (c1 != c2) {
97 cerr << "******** oops\n";
98 break;
99 }
100 }
101
102 cerr << "\n";
103 }
104
105
106 void sqrmodtest()
107 {
108 cerr << "sqrmod";
109 for (long iter = 0; iter < ITER; iter++) {
110 if (iter % 100 == 0) cerr << ".";
111
112 long n = RandomBnd(5000) + 300;
113 long da = RandomBnd(n)+1;
114 long db = RandomBnd(n)+1;
115
116 if (RandomBnd(2)) { da = n; db = n; }
117
118 zz_pX f;
119 random(f, n);
120 SetCoeff(f, n);
121 zz_pXModulus F(f);
122
123 zz_pX a, b, c1, c2;
124 random(a, da);
125 random(b, db);
126
127 SqrMod(c1, a, F);
128
129 PlainMul(c2, a, a);
130 rem(c2, c2, f);
131
132 if (c1 != c2) {
133 cerr << "******** oops\n";
134 break;
135 }
136 }
137
138 cerr << "\n";
139 }
140
141
142
143 void mulmod1test()
144 {
145 cerr << "mulmod1";
146 for (long iter = 0; iter < ITER; iter++) {
147 if (iter % 100 == 0) cerr << ".";
148
149 long n = RandomBnd(5000) + 300;
150 long da = RandomBnd(n)+1;
151 long db = RandomBnd(n)+1;
152
153 if (RandomBnd(2)) { da = n; db = n; }
154
155 zz_pX f;
156 random(f, n);
157 SetCoeff(f, n);
158 zz_pXModulus F(f);
159
160 zz_pX a, b, c1, c2;
161 random(a, da);
162 random(b, db);
163
164 zz_pXMultiplier bb;
165 build(bb, b, F);
166
167 MulMod(c1, a, bb, F);
168
169 PlainMul(c2, a, b);
170 rem(c2, c2, f);
171
172 if (c1 != c2) {
173 cerr << "******** oops\n";
174 break;
175 }
176 }
177
178 cerr << "\n";
179 }
180
181
182 namespace NTL {
183
184 void CopyReverse(zz_pX& x, const zz_pX& a, long lo, long hi);
185
186 }
187
188
189
190 struct zz_pXTransMultiplier {
191 zz_pX f0, fbi, b;
192 long shamt, shamt_fbi, shamt_b;
193 };
194
195
196
197
198 void build(zz_pXTransMultiplier& B, const zz_pX& b, const zz_pXModulus& F)
199 {
200 long db = deg(b);
201
202 if (db >= F.n) LogicError("build TransMultiplier: bad args");
203
204 zz_pX t;
205
206 LeftShift(t, b, F.n-1);
207 div(t, t, F);
208
209 // we optimize for low degree b
210
211 long d;
212
213 d = deg(t);
214 if (d < 0)
215 B.shamt_fbi = 0;
216 else
217 B.shamt_fbi = F.n-2 - d;
218
219 CopyReverse(B.fbi, t, 0, d);
220
221 // The following code optimizes the case when
222 // f = X^n + low degree poly
223
224 trunc(t, F.f, F.n);
225 d = deg(t);
226 if (d < 0)
227 B.shamt = 0;
228 else
229 B.shamt = d;
230
231 CopyReverse(B.f0, t, 0, d);
232
233 if (db < 0)
234 B.shamt_b = 0;
235 else
236 B.shamt_b = db;
237
238 CopyReverse(B.b, b, 0, db);
239 }
240
241
242
243 void TransMulMod(zz_pX& x, const zz_pX& a, const zz_pXTransMultiplier& B,
244 const zz_pXModulus& F)
245 {
246 if (deg(a) >= F.n) LogicError("TransMulMod: bad args");
247
248 zz_pX t1, t2;
249
250 mul(t1, a, B.b);
251 RightShift(t1, t1, B.shamt_b);
252
253 mul(t2, a, B.f0);
254 RightShift(t2, t2, B.shamt);
255 trunc(t2, t2, F.n-1);
256
257 mul(t2, t2, B.fbi);
258 if (B.shamt_fbi > 0) LeftShift(t2, t2, B.shamt_fbi);
259 trunc(t2, t2, F.n-1);
260 LeftShift(t2, t2, 1);
261
262 sub(x, t1, t2);
263 }
264
265
266
267 void UpdateMap(vec_zz_p& x, const vec_zz_p& a,
268 const zz_pXTransMultiplier& B, const zz_pXModulus& F)
269 {
270 zz_pX xx;
271 TransMulMod(xx, to_zz_pX(a), B, F);
272 x = xx.rep;
273 }
274
275
276
277 void updatetest()
278 {
279 cerr << "update";
280 for (long iter = 0; iter < ITER; iter++) {
281 if (iter % 100 == 0) cerr << ".";
282
283 long n = RandomBnd(5000) + 300;
284 long da = RandomBnd(n)+1;
285 long db = RandomBnd(n)+1;
286
287 if (RandomBnd(2)) { da = n; db = n; }
288
289 zz_pX f;
290 random(f, n);
291 SetCoeff(f, n);
292 zz_pXModulus F(f);
293
294 zz_pX a, b;
295 random(a, da);
296 random(b, db);
297
298 zz_pXMultiplier bb1;
299 build(bb1, b, F);
300
301 zz_pXTransMultiplier bb2;
302 build(bb2, b, F);
303
304 Vec<zz_p> x1, x2;
305
306 UpdateMap(x1, a.rep, bb1, F);
307 UpdateMap(x2, a.rep, bb2, F);
308
309
310 if (x1 != x2) {
311 cerr << "******** oops\n";
312 break;
313 }
314 }
315
316 cerr << "\n";
317 }
318
319 void divremtest()
320 {
321 cerr << "divrem";
322 for (long iter = 0; iter < ITER; iter++) {
323 if (iter % 100 == 0) cerr << ".";
324
325 long n = RandomBnd(5000) + 300;
326 long dq = RandomBnd(n);
327
328
329 zz_pX f;
330 random(f, n);
331 SetCoeff(f, n);
332 zz_pXModulus F(f);
333
334 zz_pX a, q, r, q1, r1;
335
336 random(a, 2*n-1);
337
338 DivRem(q, r, a, F);
339 rem(r1, a, F);
340 div(q1, a, F);
341
342 if (deg(r) >= n || a != q*f + r || q != q1 || r != r1) {
343 cerr << "******** oops\n";
344 break;
345 }
346 }
347
348 cerr << "\n";
349 }
350
351 int main()
352 {
353 long p;
354 p = GenPrime_long(NTL_SP_NBITS);
355
356 zz_p::init(p);
357
358 multest();
359 sqrtest();
360 mulmodtest();
361 sqrmodtest();
362 mulmod1test();
363 divremtest();
364 updatetest();
365
366 zz_p::FFTInit(0);
367
368 cerr << "FFT Prime\n";
369
370 multest();
371 sqrtest();
372 mulmodtest();
373 sqrmodtest();
374 mulmod1test();
375 divremtest();
376 updatetest();
377
378 }
379
+0
-567
src/makefile
0 ###############################################################
1 #
2 # First, choose a C++ compiler, and set compiler flags.
3 # This is done by setting the variables CXX and CXXFLAGS.
4 #
5 ###############################################################
6
7
8
9 CXX=g++
10 # A C++ compiler, e.g., g++, CC, xlC
11
12
13 CXXFLAGS=-g -O2
14 # Flags for the C++ compiler
15
16 CXXAUTOFLAGS= -std=c++11 -pthread -march=native
17 # Flags for the C++ compiler, automatically generated by configuration script
18
19 NOCONTRACT=
20
21
22 AR=ar
23 # command to make a library
24
25 ARFLAGS=ruv
26 # arguments for AR
27
28 RANLIB=ranlib
29 # set to echo if you want to disable it completely
30
31 LDFLAGS=
32 # flags for linking C++ programs
33
34 LDLIBS=-lm
35 # libraries for linking C++ programs
36
37 CPPFLAGS=
38 # arguments for the C preprocessor
39
40 LIBTOOL=libtool
41 # libtool command -- this is now built locally
42
43 LIBTOOL_LINK_FLAGS=
44 # flags to add to command line when building a shared library
45 # mainly used to pass the argument "-no-undefined" on cygwin
46
47 DEF_PREFIX=/usr/local
48
49 PREFIX=$(DEF_PREFIX)
50 LIBDIR=$(PREFIX)/lib
51 INCLUDEDIR=$(PREFIX)/include
52 DOCDIR=$(PREFIX)/share/doc
53 # where to install NTL
54
55 DESTDIR=
56 # added to support standard package building techniques
57 # that install into a "staging area"
58
59 ###############################################################
60 #
61 # Second, if you want to use GMP (the GNU Multi-Precision library),
62 # define the variables GMP_OPT_INCDIR, GMP_OPT_LIBDIR, GMP_OPT_LIB below.
63 # You will also have to set either NTL_GMP_LIP or NTL_GMP_HACK
64 # in the config.h file.
65 #
66 # Using GMP can lead to significant performance gains on some
67 # platforms. You can obtain GMP from http://www.swox.com/gmp.
68 # Once you unpack it into a directory, just execute
69 # ./configure; make
70 # in that directory.
71 #
72 ###############################################################
73
74
75 GMP_PREFIX=$(DEF_PREFIX)
76
77 GMP_INCDIR=$(GMP_PREFIX)/include
78 # directory containing gmp.h if using GMP
79
80 GMP_LIBDIR=$(GMP_PREFIX)/lib
81 # directory containing libgmp.a if using GMP
82
83 GMP_OPT_INCDIR=# -I$(GMP_INCDIR) # GMPI
84 GMP_OPT_LIBDIR=# -L$(GMP_LIBDIR) # GMPL
85 GMP_OPT_LIB=-lgmp # GMP
86 # uncomment these if using GMP
87
88
89 ###############################################################
90 #
91 # Third, if you want to use gf2x (a library for fast
92 # multiplication over GF(2)[X]), you need to
93 # define the variables GF2X_OPT_INCDIR, GF2X_OPT_LIBDIR, GF2X_OPT_LIB below.
94 # You will also have to set NTL_GF2X_LIB
95 # in the config.h file.
96 #
97 ###############################################################
98
99 GF2X_PREFIX=$(DEF_PREFIX)
100
101 GF2X_INCDIR=$(GF2X_PREFIX)/include
102 # directory containing gf2x.h if using gf2x
103
104 GF2X_LIBDIR=$(GF2X_PREFIX)/lib
105 # directory containing libgf2x.a
106
107 GF2X_OPT_INCDIR=# -I$(GF2X_INCDIR) # GF2X
108 GF2X_OPT_LIBDIR=# -L$(GF2X_LIBDIR) # GF2X
109 GF2X_OPT_LIB=# -lgf2x # GF2X
110 # uncomment these if using gf2x
111
112
113 ###############################################################
114 #
115 # Fourth, if you do not want to run the wizard that automagically
116 # sets some performance related flags in config.h, set the flag below.
117 #
118 ###############################################################
119
120
121 WIZARD=off
122 # Set to off if you want to bypass the wizard; otherwise, set to on.
123
124
125 #################################################################
126 #
127 # That's it! You can ignore everything else in this file!
128 #
129 #################################################################
130
131
132 # object files
133 OBJ=FFT.o FacVec.o GF2.o GF2E.o GF2EX.o GF2EXFactoring.o GF2X.o GF2X1.o \
134 GF2XFactoring.o GF2XVec.o GetTime.o GetPID.o HNF.o ctools.o LLL.o LLL_FP.o \
135 LLL_QP.o LLL_RR.o LLL_XD.o RR.o WordVector.o ZZ.o ZZVec.o ZZX.o ZZX1.o \
136 ZZXCharPoly.o ZZXFactoring.o ZZ_p.o ZZ_pE.o ZZ_pEX.o ZZ_pEXFactoring.o ZZ_pX.o \
137 ZZ_pX1.o ZZ_pXCharPoly.o ZZ_pXFactoring.o fileio.o lip.o lzz_p.o lzz_pE.o \
138 lzz_pEX.o lzz_pEXFactoring.o lzz_pX.o lzz_pX1.o lzz_pXCharPoly.o \
139 lzz_pXFactoring.o mat_GF2.o mat_GF2E.o mat_RR.o mat_ZZ.o mat_ZZ_p.o mat_ZZ_pE.o \
140 mat_lzz_p.o mat_lzz_pE.o mat_poly_ZZ.o mat_poly_ZZ_p.o mat_poly_lzz_p.o \
141 quad_float.o tools.o vec_GF2.o vec_GF2E.o vec_RR.o vec_ZZ.o vec_ZZ_p.o \
142 vec_ZZ_pE.o vec_lzz_p.o vec_lzz_pE.o xdouble.o G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o \
143 G_LLL_RR.o thread.o BasicThreadPool.o MatPrime.o
144
145 # library source files
146 SRC=FFT.cpp FacVec.cpp GF2.cpp GF2E.cpp GF2EX.cpp GF2EXFactoring.cpp GF2X.cpp \
147 GF2X1.cpp GF2XFactoring.cpp GF2XVec.cpp HNF.cpp ctools.cpp LLL.cpp LLL_FP.cpp \
148 LLL_QP.cpp LLL_RR.cpp LLL_XD.cpp RR.cpp WordVector.cpp ZZ.cpp ZZVec.cpp ZZX.cpp \
149 ZZX1.cpp ZZXCharPoly.cpp ZZXFactoring.cpp ZZ_p.cpp ZZ_pE.cpp ZZ_pEX.cpp \
150 ZZ_pEXFactoring.cpp ZZ_pX.cpp ZZ_pX1.cpp ZZ_pXCharPoly.cpp ZZ_pXFactoring.cpp \
151 fileio.cpp lip.cpp lzz_p.cpp lzz_pE.cpp lzz_pEX.cpp lzz_pEXFactoring.cpp \
152 lzz_pX.cpp lzz_pX1.cpp lzz_pXCharPoly.cpp lzz_pXFactoring.cpp mat_GF2.cpp \
153 mat_GF2E.cpp mat_RR.cpp mat_ZZ.cpp mat_ZZ_p.cpp mat_ZZ_pE.cpp mat_lzz_p.cpp \
154 mat_lzz_pE.cpp mat_poly_ZZ.cpp mat_poly_ZZ_p.cpp mat_poly_lzz_p.cpp \
155 quad_float.cpp tools.cpp vec_GF2.cpp vec_GF2E.cpp vec_RR.cpp vec_ZZ.cpp \
156 vec_ZZ_p.cpp vec_ZZ_pE.cpp vec_lzz_p.cpp vec_lzz_pE.cpp xdouble.cpp \
157 G_LLL_FP.cpp G_LLL_QP.cpp G_LLL_XD.cpp G_LLL_RR.cpp thread.cpp \
158 BasicThreadPool.cpp MatPrime.cpp
159
160
161
162 # library header files
163 INCL=FFT.h FacVec.h GF2.h GF2E.h GF2EX.h GF2EXFactoring.h GF2X.h \
164 GF2XFactoring.h GF2XVec.h HNF.h ctools.h LLL.h RR.h WordVector.h \
165 ZZ.h ZZ_limbs.h sp_arith.h ZZVec.h ZZX.h ZZXFactoring.h ZZ_p.h ZZ_pE.h ZZ_pEX.h \
166 ZZ_pEXFactoring.h ZZ_pX.h ZZ_pXFactoring.h fileio.h lip.h lzz_p.h lzz_pE.h \
167 lzz_pEX.h lzz_pEXFactoring.h lzz_pX.h lzz_pXFactoring.h mat_GF2.h mat_GF2E.h \
168 mat_RR.h mat_ZZ.h mat_ZZ_p.h mat_ZZ_pE.h mat_lzz_p.h mat_lzz_pE.h mat_poly_ZZ.h \
169 mat_poly_ZZ_p.h mat_poly_lzz_p.h matrix.h pair.h vector.h pair_GF2EX_long.h \
170 pair_GF2X_long.h pair_ZZX_long.h pair_ZZ_pEX_long.h pair_ZZ_pX_long.h \
171 pair_lzz_pEX_long.h pair_lzz_pX_long.h quad_float.h tools.h vec_GF2.h \
172 vec_GF2E.h vec_GF2XVec.h vec_RR.h vec_ZZ.h vec_ZZVec.h vec_ZZ_p.h vec_ZZ_pE.h \
173 vec_double.h vec_long.h vec_lzz_p.h vec_lzz_pE.h vec_quad_float.h vec_vec_GF2.h \
174 vec_vec_GF2E.h vec_vec_RR.h vec_vec_ZZ.h vec_vec_ZZ_p.h vec_vec_ZZ_pE.h \
175 vec_vec_long.h vec_vec_lzz_p.h vec_vec_lzz_pE.h vec_xdouble.h xdouble.h \
176 config.h version.h new.h vec_ulong.h vec_vec_ulong.h SmartPtr.h \
177 Lazy.h LazyTable.h thread.h BasicThreadPool.h MatPrime.h
178
179
180
181 # test data
182 TD=BerlekampTestIn BerlekampTestOut CanZassTestIn CanZassTestOut \
183 ZZXFacTestIn ZZXFacTestOut MoreFacTestIn LLLTestIn LLLTestOut RRTestIn RRTestOut \
184 MatrixTestIn MatrixTestOut CharPolyTestIn \
185 CharPolyTestOut QuadTestIn QuadTestOut
186
187
188 # test source files
189 TS=QuickTest.cpp ZZTest.cpp BerlekampTest.cpp CanZassTest.cpp ZZXFacTest.cpp \
190 MoreFacTest.cpp LLLTest.cpp subset.cpp MatrixTest.cpp mat_lzz_pTest.cpp \
191 CharPolyTest.cpp RRTest.cpp QuadTest.cpp GF2XTest.cpp GF2EXTest.cpp \
192 BitMatTest.cpp ZZ_pEXTest.cpp lzz_pEXTest.cpp Timing.cpp ThreadTest.cpp \
193 ExceptionTest.cpp
194
195 # scripts
196 SCRIPTS=MakeGetTime MakeGetPID MakeCheckFeatures ResetFeatures CopyFeatures \
197 TestScript dosify unixify RemoveProg configure DoConfig mfile cfile ppscript
198
199
200 # auxiliary source
201 MD=MakeDesc.cpp MakeDescAux.cpp newnames.cpp gen_gmp_aux.cpp gf2x_version_1_2_or_later_required.cpp
202 GT=GetTime0.cpp GetTime1.cpp GetTime2.cpp GetTime3.cpp GetTime4.cpp GetTime5.cpp TestGetTime.cpp
203 GP=GetPID1.cpp GetPID2.cpp TestGetPID.cpp
204 CH=CheckCompile.cpp GenConfigInfo.cpp CheckContract.cpp CheckContractAux.cpp \
205 CheckThreads.cpp
206
207 AUXPROGS = TestGetTime TestGetPID CheckFeatures CheckCompile GenConfigInfo CheckContract \
208 CheckThreads
209
210 FEATURES=ALIGNED_ARRAY BUILTIN_CLZL LL_TYPE SSSE3 AVX PCLMUL AVX2 FMA \
211 COPY_TRAITS1 COPY_TRAITS2 CHRONO_TIME MACOS_TIME POSIX_TIME
212
213
214 # documentation
215
216
217 DFILES=copying.txt BasicThreadPool.txt GF2.txt GF2E.txt GF2EX.txt \
218 GF2EXFactoring.txt GF2X.txt GF2XFactoring.txt GF2XVec.txt HNF.txt Lazy.txt \
219 LazyTable.txt LLL.txt RR.txt SmartPtr.txt ZZ.txt ZZ_limbs.txt ZZVec.txt ZZX.txt \
220 ZZXFactoring.txt ZZ_p.txt ZZ_pE.txt ZZ_pEX.txt ZZ_pEXFactoring.txt ZZ_pX.txt \
221 ZZ_pXFactoring.txt conversions.txt flags.txt lzz_p.txt lzz_pE.txt lzz_pEX.txt \
222 lzz_pEXFactoring.txt lzz_pX.txt lzz_pXFactoring.txt mat_GF2.txt mat_GF2E.txt \
223 mat_RR.txt mat_ZZ.txt mat_ZZ_p.txt mat_ZZ_pE.txt mat_lzz_p.txt mat_lzz_pE.txt \
224 mat_poly_ZZ.txt mat_poly_ZZ_p.txt mat_poly_lzz_p.txt matrix.txt pair.txt \
225 vector.txt quad_float.txt sedscript.txt tools.txt vec_GF2.txt vec_GF2E.txt \
226 vec_RR.txt vec_ZZ.txt vec_ZZ_p.txt vec_ZZ_pE.txt vec_lzz_p.txt vec_lzz_pE.txt \
227 xdouble.txt names.txt tour-ack.html tour-intro.html tour-time.html \
228 tour-changes.html tour-modules.html tour-unix.html tour-examples.html \
229 tour-roadmap.html tour-win.html tour-impl.html tour-struct.html tour.html \
230 tour-ex1.html tour-ex2.html tour-ex3.html tour-ex4.html tour-ex5.html \
231 tour-ex6.html tour-ex7.html arrow1.gif arrow2.gif arrow3.gif tour-gmp.html \
232 tour-gf2x.html tour-tips.html config.txt version.txt
233
234
235
236 TXFILES=GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt \
237 GF2XFactoring.txt GF2XVec.txt HNF.txt Lazy.txt LazyTable.txt LLL.txt RR.txt \
238 SmartPtr.txt ZZ.txt ZZ_limbs.txt ZZVec.txt ZZX.txt ZZXFactoring.txt ZZ_p.txt ZZ_pE.txt \
239 ZZ_pEX.txt ZZ_pEXFactoring.txt ZZ_pX.txt ZZ_pXFactoring.txt lzz_p.txt \
240 lzz_pE.txt lzz_pEX.txt lzz_pEXFactoring.txt lzz_pX.txt lzz_pXFactoring.txt \
241 mat_GF2.txt mat_GF2E.txt mat_RR.txt mat_ZZ.txt mat_ZZ_p.txt mat_ZZ_pE.txt \
242 mat_lzz_p.txt mat_lzz_pE.txt mat_poly_ZZ.txt mat_poly_ZZ_p.txt \
243 mat_poly_lzz_p.txt matrix.txt pair.txt quad_float.txt tools.txt vec_GF2.txt \
244 vec_GF2E.txt vec_RR.txt vec_ZZ.txt vec_ZZ_p.txt vec_ZZ_pE.txt vec_lzz_p.txt \
245 vec_lzz_pE.txt vector.txt version.txt xdouble.txt BasicThreadPool.txt
246
247
248 HTFILES=GF2.cpp.html GF2E.cpp.html GF2EX.cpp.html GF2EXFactoring.cpp.html \
249 GF2X.cpp.html GF2XFactoring.cpp.html GF2XVec.cpp.html HNF.cpp.html \
250 Lazy.cpp.html LazyTable.cpp.html LLL.cpp.html RR.cpp.html SmartPtr.cpp.html \
251 ZZ.cpp.html ZZ_limbs.cpp.html ZZVec.cpp.html ZZX.cpp.html ZZXFactoring.cpp.html ZZ_p.cpp.html \
252 ZZ_pE.cpp.html ZZ_pEX.cpp.html ZZ_pEXFactoring.cpp.html ZZ_pX.cpp.html \
253 ZZ_pXFactoring.cpp.html lzz_p.cpp.html lzz_pE.cpp.html lzz_pEX.cpp.html \
254 lzz_pEXFactoring.cpp.html lzz_pX.cpp.html lzz_pXFactoring.cpp.html \
255 mat_GF2.cpp.html mat_GF2E.cpp.html mat_RR.cpp.html mat_ZZ.cpp.html \
256 mat_ZZ_p.cpp.html mat_ZZ_pE.cpp.html mat_lzz_p.cpp.html mat_lzz_pE.cpp.html \
257 mat_poly_ZZ.cpp.html mat_poly_ZZ_p.cpp.html mat_poly_lzz_p.cpp.html \
258 matrix.cpp.html pair.cpp.html quad_float.cpp.html tools.cpp.html \
259 vec_GF2.cpp.html vec_GF2E.cpp.html vec_RR.cpp.html vec_ZZ.cpp.html \
260 vec_ZZ_p.cpp.html vec_ZZ_pE.cpp.html vec_lzz_p.cpp.html vec_lzz_pE.cpp.html \
261 vector.cpp.html version.cpp.html xdouble.cpp.html BasicThreadPool.cpp.html
262
263
264
265 DOC = $(DFILES) $(HTFILES)
266
267
268 # test program executables
269 PROGS=QuickTest ZZTest BerlekampTest CanZassTest ZZXFacTest MoreFacTest LLLTest \
270 BitMatTest MatrixTest mat_lzz_pTest CharPolyTest RRTest QuadTest GF2XTest \
271 GF2EXTest subset ZZ_pEXTest lzz_pEXTest Timing ThreadTest
272
273 # things to save to a tar file
274 SFILES=makefile $(SRC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) mach_desc.win \
275 Poly1TimeTest.cpp Poly2TimeTest.cpp Poly3TimeTest.cpp GF2XTimeTest.cpp \
276 InitSettings.cpp DispSettings.cpp WizardAux Wizard
277
278
279 #################################################################
280 #
281 # Rules for compiling the library
282 #
283 #################################################################
284
285
286 NTL_INCLUDE = -I../include -I.
287 # NTL needs this to find its include files
288
289 COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) $(CXXAUTOFLAGS) -c
290
291 LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) $(CXXAUTOFLAGS) $(LDFLAGS)
292
293
294
295 # 'make' or 'make all' does a complete make, including additional
296 # setup not done in configure.
297
298 # The file setup-phase is removed by the configure script
299 # when it terminates successfully.
300
301 # The file need-to-run-configure is removed by the configure script
302 # before making any changes to makefile/config.h and is recreated
303 # when it terminates successfully.
304
305 all: setup-phase
306 $(MAKE) ntl.a
307
308 setup-phase: need-to-run-configure
309 $(MAKE) clobber
310 $(MAKE) setup1
311 $(MAKE) setup2
312 $(MAKE) setup3
313 $(MAKE) setup4
314 touch setup-phase
315
316
317 # setup1 generates the file ../include/NTL/mach_desc.h
318
319 setup1:
320 $(COMPILE) MakeDescAux.cpp
321 $(LINK) -o MakeDesc MakeDesc.cpp MakeDescAux.o $(LDLIBS)
322 ./MakeDesc
323 mv mach_desc.h ../include/NTL/mach_desc.h
324
325
326 # setup2 does some dynamic checks for GetTime, GetPID, and other features
327
328 setup2:
329 echo "*** CheckFeatures log ***" > CheckFeatures.log
330 sh MakeGetTime "$(LINK)" "$(LDLIBS)"
331 sh MakeGetPID "$(LINK)" "$(LDLIBS)"
332 sh MakeCheckFeatures "$(FEATURES)" "$(LINK)" "$(LDLIBS)"
333
334 # NOTE: to add a feature XXX:
335 # * add a program CheckXXX.cpp which returns 0 if XXX works, -1 otherwise
336 # * add XXX to the FEATURES variable
337
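As an illustration of the feature-check convention in the NOTE just above (an editorial sketch, not part of the makefile or of NTL itself; the name CheckFoo and its test body are made up), such a check is just a tiny translation unit whose exit status reports whether the feature works:

    // CheckFoo.cpp -- hypothetical feature test following the stated convention:
    // return 0 if feature "Foo" works on this platform, -1 otherwise.
    int main()
    {
       bool foo_works = true;   // placeholder: actually exercise the feature here
       if (foo_works) return 0;
       return -1;
    }

The MakeCheckFeatures invocation in setup2 above drives one such program per entry in the FEATURES variable.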
338 # setup3 generates the file ../include/NTL/gmp_aux.h
339 # The file ../include/NTL/gmp_aux.h is included in ../include/NTL/lip.h
340 # when NTL_GMP_LIP is set.
341 # When this flag is not set, an empty file is produced.
342 # This also checks that the right version of the gf2x library is present.
343
344 setup3:
345 $(LINK) $(GMP_OPT_INCDIR) -o gen_gmp_aux gen_gmp_aux.cpp $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
346 ./gen_gmp_aux > ../include/NTL/gmp_aux.h
347 $(LINK) $(GF2X_OPT_INCDIR) -o gf2x_version_1_2_or_later_required gf2x_version_1_2_or_later_required.cpp $(GF2X_OPT_LIBDIR) $(GF2X_OPT_LIB) $(LDLIBS)
348
349 # setup4 runs the wizard
350
351 setup4:
352 sh Wizard $(WIZARD) "$(MAKE)" "$(FEATURES)"
353
354
355 ntl.a: $(OBJ)
356 $(AR) $(ARFLAGS) ntl.a $(OBJ) #LSTAT
357 - $(RANLIB) ntl.a #LSTAT
358 # $(LIBTOOL) --tag=CXX --mode=link $(LINK) $(LIBTOOL_LINK_FLAGS) -o libntl.la $(OBJ:.o=.lo) $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(GF2X_OPT_LIBDIR) $(GF2X_OPT_LIB) $(LDLIBS) -rpath $(LIBDIR) -version-info `cat VERSION_INFO` #LSHAR
359
360 LCOMP= #LSTAT
361 # LCOMP=$(LIBTOOL) --tag=CXX --mode=compile #LSHAR
362
363 lip.o: lip.cpp
364 $(LCOMP) $(COMPILE) $(GMP_OPT_INCDIR) lip.cpp
365
366 GF2X.o: GF2X.cpp
367 $(LCOMP) $(COMPILE) $(GF2X_OPT_INCDIR) GF2X.cpp
368
369 quad_float.o: quad_float.cpp
370 $(LCOMP) $(COMPILE) $(NOCONTRACT) quad_float.cpp
371
372 CheckCompile: CheckCompile.cpp
373 $(LINK) -o CheckCompile CheckCompile.cpp $(LDLIBS)
374
375 GenConfigInfo: GenConfigInfo.cpp
376 $(LINK) -o GenConfigInfo GenConfigInfo.cpp $(LDLIBS)
377
378 CheckContract: CheckContract.cpp CheckContractAux.cpp
379 $(LINK) $(NOCONTRACT) -o CheckContract CheckContract.cpp CheckContractAux.cpp $(LDLIBS)
380
381 CheckThreads: CheckThreads.cpp
382 $(LINK) -o CheckThreads CheckThreads.cpp $(LDLIBS)
383
384
385 .cpp.o:
386 $(LCOMP) $(COMPILE) $<
387
388 .cpp:
389 $(LINK) -o $@ $< ntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(GF2X_OPT_LIBDIR) $(GF2X_OPT_LIB) $(LDLIBS) #LSTAT
390 # $(LIBTOOL) --tag=CXX --mode=link $(LINK) -o $@ $< libntl.la #LSHAR
391
392 #################################################################
393 #
394 # Rule for running tests
395 # make check runs a series of tests
396 #
397 #################################################################
398
399 check:
400 sh RemoveProg $(PROGS)
401 $(MAKE) QuickTest
402 ./QuickTest
403 sh RemoveProg QuickTest
404 sh TestScript "$(MAKE)"
405
406 #################################################################
407 #
408 # Rule for installing
409 # make install just does a simple copy of the include files
410 # and library. The -p option is used to preserve file attributes.
411 # This helps avoid some problems (especially when copying ntl.a).
412 # Also, an attempt is made to make everything that is
413 # installed readable by everyone.
414 #
415 # make uninstall removes these files
416 #
417 #################################################################
418
419
420
421
422 install:
423 mkdir -p -m 755 $(DESTDIR)$(INCLUDEDIR)
424 rm -rf $(DESTDIR)$(INCLUDEDIR)/NTL
425 mkdir -m 755 $(DESTDIR)$(INCLUDEDIR)/NTL
426 cp -p ../include/NTL/*.h $(DESTDIR)$(INCLUDEDIR)/NTL
427 - chmod -R a+r $(DESTDIR)$(INCLUDEDIR)/NTL
428 mkdir -p -m 755 $(DESTDIR)$(DOCDIR)
429 rm -rf $(DESTDIR)$(DOCDIR)/NTL
430 mkdir -m 755 $(DESTDIR)$(DOCDIR)/NTL
431 cp -p ../doc/*.txt $(DESTDIR)$(DOCDIR)/NTL
432 cp -p ../doc/*.html $(DESTDIR)$(DOCDIR)/NTL
433 cp -p ../doc/*.gif $(DESTDIR)$(DOCDIR)/NTL
434 - chmod -R a+r $(DESTDIR)$(DOCDIR)/NTL
435 mkdir -p -m 755 $(DESTDIR)$(LIBDIR)
436 cp -p ntl.a $(DESTDIR)$(LIBDIR)/libntl.a #LSTAT
437 - chmod a+r $(DESTDIR)$(LIBDIR)/libntl.a #LSTAT
438 # $(LIBTOOL) --mode=install cp -p libntl.la $(DESTDIR)$(LIBDIR) #LSHAR
439
440
441 uninstall:
442 rm -f $(LIBDIR)/libntl.a #LSTAT
443 # $(LIBTOOL) --mode=uninstall rm -f $(LIBDIR)/libntl.la #LSHAR
444 rm -rf $(INCLUDEDIR)/NTL
445 rm -rf $(DOCDIR)/NTL
446
447 #################################################################
448 #
449 # Rules for cleaning up
450 #
451 # make clobber removes *everything* created by make,
452 # but it does not restore config.h to its default.
453 #
454 # make clean tidies up a bit
455 #
456 #################################################################
457
458 clobber:
459 rm -f ntl.a mach_desc.h ../include/NTL/mach_desc.h GetTime.cpp GetPID.cpp
460 sh ResetFeatures '..' "$(FEATURES)"
461 rm -f ../include/NTL/gmp_aux.h
462 sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux gf2x_version_1_2_or_later_required
463 rm -f *.o
464 rm -rf small
465 rm -f cfileout mfileout
466 rm -rf .libs *.lo libntl.la
467 rm -f setup-phase
468
469 clean:
470 sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux gf2x_version_1_2_or_later_required
471 rm -f *.o
472 rm -rf small
473 # - $(LIBTOOL) --mode=clean rm -f libntl.la *.lo #LSHAR
474
475 wclean:
476 rm -f *.o
477
478 #################################################################
479 #
480 # Rules for making tar and zip files
481 #
482 # make libtool-gen-origin generates the directory
483 # libtool-origin that is included in the distribution
484 # - this only needs to be run very occasionally, to keep
485 # libtool relatively up-to-date
486 # - it must be run on a machine with autotools
487 #
488 # make ppdoc creates pretty-printed versions of some documentation
489 # - run before make package or make winpack
490 #
491 # make package creates a tar.gz file suitable for Unix
492 #
493 # make winpack creates a zip file suitable for Windows
494 #
495 #################################################################
496
497 libtool-gen-origin:
498 rm -rf libtool-origin && \
499 cp -R libtool-seed libtool-origin && \
500 cd libtool-origin && autoreconf -fiv && rm -rf autom4te.cache
501
502 ppdoc:
503 sh ppscript "$(TXFILES)"
504
505 ppclean:
506 rm -f ../doc/*.cpp
507
508
509 package:
510 sh unixify "$(SFILES) DIRNAME WINDIR VERSION_INFO NOTES" "$(INCL)" "$(DOC)" "$(FEATURES)"
511 rm -rf `cat DIRNAME`
512 rm -f `cat DIRNAME`.tar
513 rm -f `cat DIRNAME`.tar.gz
514 mv unix `cat DIRNAME`
515 chmod -R a+rX `cat DIRNAME`
516 tar -cvf `cat DIRNAME`.tar `cat DIRNAME`
517 gzip `cat DIRNAME`.tar
518 rm -rf `cat DIRNAME`
519
520 winpack:
521 ./configure --nowrite NTL_GMP_LIP=off NTL_TLS_HACK=off
522 sh dosify "$(SRC)" "$(INCL)" "$(DOC)" "$(TS)" "$(TD)" "$(FEATURES)"
523 rm -rf `cat WINDIR`
524 rm -f `cat WINDIR`.zip
525 mv dos `cat WINDIR`
526 chmod -R a+rX `cat WINDIR`
527 find ./`cat WINDIR` '!' -name '*.gif' -print | zip -l `cat WINDIR` -@
528 find ./`cat WINDIR` -name '*.gif' -print | zip -u `cat WINDIR` -@
529 rm -rf `cat WINDIR`
530
531
532 ######################################################################
533 #
534 # config wizard related stuff
535 #
536 ######################################################################
537
538 WOBJ=FFT.o GetTime.o GetPID.o ctools.o ZZ.o ZZVec.o ZZ_p.o ZZ_pX.o ZZ_pX1.o \
539 lip.o tools.o vec_ZZ.o vec_ZZ_p.o GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o \
540 thread.o BasicThreadPool.o fileio.o
541
542
543 # wntl.a: LCOMP= #LSHAR
544 wntl.a: $(WOBJ)
545 $(AR) $(ARFLAGS) wntl.a $(WOBJ)
546 - $(RANLIB) wntl.a
547
548
549 Poly1TimeTest:
550 $(LINK) -o Poly1TimeTest Poly1TimeTest.cpp wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
551 Poly2TimeTest:
552 $(LINK) -o Poly2TimeTest Poly2TimeTest.cpp wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
553 Poly3TimeTest:
554 $(LINK) -o Poly3TimeTest Poly3TimeTest.cpp wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
555
556
557 GF2XTimeTest:
558 $(LINK) -o GF2XTimeTest GF2XTimeTest.cpp wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
559
560 InitSettings:
561 $(LINK) -o InitSettings InitSettings.cpp $(LDLIBS)
562
563
564 DispSettings:
565 $(LINK) -o DispSettings DispSettings.cpp $(LDLIBS)
566
619619 //
620620 // ******************************************************************
621621
622 //#undef NTL_HAVE_AVX
623 //#undef NTL_HAVE_FMA
624 //#undef NTL_HAVE_AVX512F
625 // for testing purposes
626
627 #if (defined(NTL_HAVE_AVX512F) && defined(NTL_AVOID_AVX512))
628 #undef NTL_HAVE_AVX512F
629 #endif
630
622631 #define MAT_BLK_SZ (32)
623632
624633
643652 #else
644653 #define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c))
645654 #endif
655
656
657 #ifdef NTL_HAVE_AVX512F
658 #define MUL_ADD512(a, b, c) a = _mm512_fmadd_pd(b, c, a)
659 #endif
660
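// Exposition note (not part of the library source): MUL_ADD512(acc, avec, bvec)
// is an elementwise fused multiply-add on the 8 doubles packed in a __m512d,
// i.e. acc[j] += avec[j] * bvec[j] for j = 0..7. The MUL_ADD macro above does
// the same on the 4 doubles of a __m256d.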
661
662
663 #ifdef NTL_HAVE_AVX512F
664
665 static
666 void muladd1_by_32(double *x, const double *a, const double *b, long n)
667 {
668 __m512d avec0, bvec;
669
670 __m512d acc00, acc01, acc02, acc03;
671
672 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
673 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
674 acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
675 acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);
676
677 for (long i = 0; i < n; i++) {
678 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
679
680 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
681 MUL_ADD512(acc00, avec0, bvec);
682
683 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
684 MUL_ADD512(acc01, avec0, bvec);
685
686 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
687 MUL_ADD512(acc02, avec0, bvec);
688
689 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
690 MUL_ADD512(acc03, avec0, bvec);
691 }
692
693
694 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
695 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
696 _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
697 _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);
698
699 }
700
701 static
702 void muladd2_by_32(double *x, const double *a, const double *b, long n)
703 {
704 __m512d avec0, avec1, bvec;
705
706 __m512d acc00, acc01, acc02, acc03;
707 __m512d acc10, acc11, acc12, acc13;
708
709
710
711 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
712 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
713 acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
714 acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);
715
716 acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
717 acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
718 acc12=_mm512_load_pd(x + 2*8 + 1*MAT_BLK_SZ);
719 acc13=_mm512_load_pd(x + 3*8 + 1*MAT_BLK_SZ);
720
721 for (long i = 0; i < n; i++) {
722 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
723 avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
724
725 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
726 MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
727
728 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
729 MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
730
731 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
732 MUL_ADD512(acc02, avec0, bvec); MUL_ADD512(acc12, avec1, bvec);
733
734 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
735 MUL_ADD512(acc03, avec0, bvec); MUL_ADD512(acc13, avec1, bvec);
736 }
737
738
739 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
740 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
741 _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
742 _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);
743
744 _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
745 _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
746 _mm512_store_pd(x + 2*8 + 1*MAT_BLK_SZ, acc12);
747 _mm512_store_pd(x + 3*8 + 1*MAT_BLK_SZ, acc13);
748
749 }
750
751
752 static
753 void muladd3_by_32(double *x, const double *a, const double *b, long n)
754 {
755 __m512d avec0, avec1, avec2, bvec;
756
757 __m512d acc00, acc01, acc02, acc03;
758 __m512d acc10, acc11, acc12, acc13;
759 __m512d acc20, acc21, acc22, acc23;
760
761
762
763 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
764 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
765 acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
766 acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);
767
768 acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
769 acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
770 acc12=_mm512_load_pd(x + 2*8 + 1*MAT_BLK_SZ);
771 acc13=_mm512_load_pd(x + 3*8 + 1*MAT_BLK_SZ);
772
773 acc20=_mm512_load_pd(x + 0*8 + 2*MAT_BLK_SZ);
774 acc21=_mm512_load_pd(x + 1*8 + 2*MAT_BLK_SZ);
775 acc22=_mm512_load_pd(x + 2*8 + 2*MAT_BLK_SZ);
776 acc23=_mm512_load_pd(x + 3*8 + 2*MAT_BLK_SZ);
777
778 for (long i = 0; i < n; i++) {
779 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
780 avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
781 avec2 = _mm512_set1_pd(a[i+2*MAT_BLK_SZ]);
782
783 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
784 MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
785 MUL_ADD512(acc20, avec2, bvec);
786
787 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
788 MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
789 MUL_ADD512(acc21, avec2, bvec);
790
791 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
792 MUL_ADD512(acc02, avec0, bvec); MUL_ADD512(acc12, avec1, bvec);
793 MUL_ADD512(acc22, avec2, bvec);
794
795 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
796 MUL_ADD512(acc03, avec0, bvec); MUL_ADD512(acc13, avec1, bvec);
797 MUL_ADD512(acc23, avec2, bvec);
798 }
799
800 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
801 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
802 _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
803 _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);
804
805 _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
806 _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
807 _mm512_store_pd(x + 2*8 + 1*MAT_BLK_SZ, acc12);
808 _mm512_store_pd(x + 3*8 + 1*MAT_BLK_SZ, acc13);
809
810 _mm512_store_pd(x + 0*8 + 2*MAT_BLK_SZ, acc20);
811 _mm512_store_pd(x + 1*8 + 2*MAT_BLK_SZ, acc21);
812 _mm512_store_pd(x + 2*8 + 2*MAT_BLK_SZ, acc22);
813 _mm512_store_pd(x + 3*8 + 2*MAT_BLK_SZ, acc23);
814
815
816 }
817
818
819 static
820 void muladd1_by_16(double *x, const double *a, const double *b, long n)
821 {
822 __m512d avec0, bvec;
823
824 __m512d acc00, acc01;
825
826
827
828 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
829 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
830
831 for (long i = 0; i < n; i++) {
832 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
833
834 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
835 MUL_ADD512(acc00, avec0, bvec);
836
837 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
838 MUL_ADD512(acc01, avec0, bvec);
839 }
840
841
842 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
843 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
844
845 }
846
847 static
848 void muladd2_by_16(double *x, const double *a, const double *b, long n)
849 {
850 __m512d avec0, avec1, bvec;
851
852 __m512d acc00, acc01;
853 __m512d acc10, acc11;
854
855
856
857 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
858 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
859
860 acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
861 acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
862
863 for (long i = 0; i < n; i++) {
864 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
865 avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
866
867 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
868 MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
869
870 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
871 MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
872 }
873
874
875 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
876 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
877
878 _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
879 _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
880 }
881
882
883 static
884 void muladd3_by_16(double *x, const double *a, const double *b, long n)
885 {
886 __m512d avec0, avec1, avec2, bvec;
887
888 __m512d acc00, acc01;
889 __m512d acc10, acc11;
890 __m512d acc20, acc21;
891
892
893
894 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
895 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
896
897 acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
898 acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
899
900 acc20=_mm512_load_pd(x + 0*8 + 2*MAT_BLK_SZ);
901 acc21=_mm512_load_pd(x + 1*8 + 2*MAT_BLK_SZ);
902
903
904 for (long i = 0; i < n; i++) {
905 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
906 avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
907 avec2 = _mm512_set1_pd(a[i+2*MAT_BLK_SZ]);
908
909 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
910 MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
911 MUL_ADD512(acc20, avec2, bvec);
912
913 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
914 MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
915 MUL_ADD512(acc21, avec2, bvec);
916 }
917
918
919 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
920 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
921
922 _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
923 _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
924
925 _mm512_store_pd(x + 0*8 + 2*MAT_BLK_SZ, acc20);
926 _mm512_store_pd(x + 1*8 + 2*MAT_BLK_SZ, acc21);
927
928 }
929
930
931
932 #else
646933
647934 static
648935 void muladd1_by_32(double *x, const double *a, const double *b, long n)
686973 }
687974
688975 static
976 void muladd2_by_32(double *x, const double *a, const double *b, long n)
977 {
978 __m256d avec0, avec1, bvec;
979 __m256d acc00, acc01, acc02, acc03;
980 __m256d acc10, acc11, acc12, acc13;
981
982
983 // round 0
984
985 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
986 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
987 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
988 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
989
990 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
991 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
992 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
993 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
994
995 for (long i = 0; i < n; i++) {
996 avec0 = _mm256_broadcast_sd(&a[i]);
997 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
998
999 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
1000 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
1001 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
1002 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
1003 }
1004
1005
1006 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
1007 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
1008 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
1009 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
1010
1011 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
1012 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
1013 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
1014 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
1015
1016 // round 1
1017
1018 acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
1019 acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
1020 acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
1021 acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
1022
1023 acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
1024 acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
1025 acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
1026 acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
1027
1028 for (long i = 0; i < n; i++) {
1029 avec0 = _mm256_broadcast_sd(&a[i]);
1030 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
1031
1032 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
1033 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
1034 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
1035 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
1036 }
1037
1038
1039 _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
1040 _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
1041 _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
1042 _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
1043
1044 _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
1045 _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
1046 _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
1047 _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
1048
1049 }
1050
1051 // NOTE: this makes things slower on an AVX1 platform --- not enough registers
1052 // it could be faster on AVX2/FMA, where there should be enough registers
1053 static
1054 void muladd3_by_32(double *x, const double *a, const double *b, long n)
1055 {
1056 __m256d avec0, avec1, avec2, bvec;
1057 __m256d acc00, acc01, acc02, acc03;
1058 __m256d acc10, acc11, acc12, acc13;
1059 __m256d acc20, acc21, acc22, acc23;
1060
1061
1062 // round 0
1063
1064 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
1065 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
1066 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
1067 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
1068
1069 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
1070 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
1071 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
1072 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
1073
1074 acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
1075 acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
1076 acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
1077 acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);
1078
1079 for (long i = 0; i < n; i++) {
1080 avec0 = _mm256_broadcast_sd(&a[i]);
1081 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
1082 avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
1083
1084 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
1085 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
1086 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
1087 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
1088 }
1089
1090
1091 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
1092 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
1093 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
1094 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
1095
1096 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
1097 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
1098 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
1099 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
1100
1101 _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
1102 _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
1103 _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
1104 _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);
1105
1106 // round 1
1107
1108 acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
1109 acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
1110 acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
1111 acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
1112
1113 acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
1114 acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
1115 acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
1116 acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
1117
1118 acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
1119 acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
1120 acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
1121 acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);
1122
1123 for (long i = 0; i < n; i++) {
1124 avec0 = _mm256_broadcast_sd(&a[i]);
1125 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
1126 avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
1127
1128 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
1129 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
1130 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
1131 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
1132 }
1133
1134
1135 _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
1136 _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
1137 _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
1138 _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
1139
1140 _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
1141 _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
1142 _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
1143 _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
1144
1145 _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
1146 _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
1147 _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
1148 _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);
1149
1150 }
1151
1152 static
6891153 void muladd1_by_16(double *x, const double *a, const double *b, long n)
6901154 {
6911155 __m256d avec, bvec;
7161180 }
7171181
7181182
719 // experiment: process two rows at a time
1183
7201184 static
721 void muladd2_by_32(double *x, const double *a, const double *b, long n)
1185 void muladd2_by_16(double *x, const double *a, const double *b, long n)
7221186 {
7231187 __m256d avec0, avec1, bvec;
7241188 __m256d acc00, acc01, acc02, acc03;
7581222 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
7591223 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
7601224
761 // round 1
762
763 acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
764 acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
765 acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
766 acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
767
768 acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
769 acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
770 acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
771 acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
772
773 for (long i = 0; i < n; i++) {
774 avec0 = _mm256_broadcast_sd(&a[i]);
775 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
776
777 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
778 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
779 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
780 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
781 }
782
783
784 _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
785 _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
786 _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
787 _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
788
789 _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
790 _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
791 _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
792 _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
793
794 }
795
796
797 // experiment: process three rows at a time
798 // NOTE: this makes things slower on an AVX1 platform --- not enough registers
799 // it could be faster on AVX2/FMA, where there should be enough registers
1225 }
1226
8001227
8011228 static
802 void muladd3_by_32(double *x, const double *a, const double *b, long n)
1229 void muladd3_by_16(double *x, const double *a, const double *b, long n)
8031230 {
8041231 __m256d avec0, avec1, avec2, bvec;
8051232 __m256d acc00, acc01, acc02, acc03;
8511278 _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
8521279 _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);
8531280
854 // round 1
855
856 acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
857 acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
858 acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
859 acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
860
861 acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
862 acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
863 acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
864 acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
865
866 acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
867 acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
868 acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
869 acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);
870
871 for (long i = 0; i < n; i++) {
872 avec0 = _mm256_broadcast_sd(&a[i]);
873 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
874 avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
875
876 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
877 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
878 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
879 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
880 }
881
882
883 _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
884 _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
885 _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
886 _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
887
888 _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
889 _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
890 _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
891 _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
892
893 _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
894 _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
895 _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
896 _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);
897
898 }
899
900 static
901 void muladd2_by_16(double *x, const double *a, const double *b, long n)
902 {
903 __m256d avec0, avec1, bvec;
904 __m256d acc00, acc01, acc02, acc03;
905 __m256d acc10, acc11, acc12, acc13;
906
907
908 // round 0
909
910 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
911 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
912 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
913 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
914
915 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
916 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
917 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
918 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
919
920 for (long i = 0; i < n; i++) {
921 avec0 = _mm256_broadcast_sd(&a[i]);
922 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
923
924 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
925 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
926 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
927 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
928 }
929
930
931 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
932 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
933 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
934 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
935
936 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
937 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
938 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
939 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
940
941 }
942
943 static
944 void muladd3_by_16(double *x, const double *a, const double *b, long n)
945 {
946 __m256d avec0, avec1, avec2, bvec;
947 __m256d acc00, acc01, acc02, acc03;
948 __m256d acc10, acc11, acc12, acc13;
949 __m256d acc20, acc21, acc22, acc23;
950
951
952 // round 0
953
954 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
955 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
956 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
957 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
958
959 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
960 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
961 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
962 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
963
964 acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
965 acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
966 acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
967 acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);
968
969 for (long i = 0; i < n; i++) {
970 avec0 = _mm256_broadcast_sd(&a[i]);
971 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
972 avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
973
974 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
975 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
976 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
977 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
978 }
979
980
981 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
982 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
983 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
984 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
985
986 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
987 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
988 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
989 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
990
991 _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
992 _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
993 _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
994 _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);
995
996 }
1281 }
1282
1283
1284
1285
1286 #endif
1287
1288
1289
9971290
9981291 static inline
9991292 void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
10001293 {
10011294 long i = first;
1002 #ifdef NTL_HAVE_FMA
1003 // processing three rows at a time is faster
1295 #if (defined(NTL_HAVE_FMA) || defined(NTL_HAVE_AVX512F))
1296 // process three rows at a time
10041297 for (; i <= last-3; i+=3)
10051298 muladd3_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
10061299 for (; i < last; i++)
10191312 void muladd_all_by_16(long first, long last, double *x, const double *a, const double *b, long n)
10201313 {
10211314 long i = first;
1022 #ifdef NTL_HAVE_FMA
1315 #if (defined(NTL_HAVE_FMA) || defined(NTL_HAVE_AVX512F))
10231316 // processing three rows at a time is faster
10241317 for (; i <= last-3; i+=3)
10251318 muladd3_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
34163709 // multiply row k by pivot_inv
34173710 long t1 = pivot_inv;
34183711 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
3419 long * NTL_RESTRICT y = &M[k][0];
3712 long *y = &M[k][0];
34203713 for (long j = 0; j < n; j++)
34213714 y[j] = MulModPrecon(y[j], t1, p, t1pinv);
34223715
34293722 NTL_IMPORT(p)
34303723 NTL_IMPORT(n)
34313724 NTL_IMPORT(k)
3432 long * NTL_RESTRICT y = &M[k][0];
3725 long *y = &M[k][0];
34333726 for (long i = first; i < last; i++) {
34343727 if (i == k) continue; // skip row k
34353728
3436 long * NTL_RESTRICT x = &M[i][0];
3729 long *x = &M[i][0];
34373730 long t1 = x[k];
34383731 t1 = NegateMod(t1, p);
34393732 x[k] = 0;
34593752       // pivot columns, using reverse swap sequence
34603753
34613754 for (long i = 0; i < n; i++) {
3462 long * NTL_RESTRICT x = &M[i][0];
3755 long *x = &M[i][0];
34633756
34643757 for (long k = n-1; k >= 0; k--) {
34653758 long pos = P[k];
35663859 // multiply row k by pivot_inv
35673860 long t1 = pivot_inv;
35683861 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
3569 unsigned long * NTL_RESTRICT y = &M[k][0];
3862 unsigned long *y = &M[k][0];
35703863 for (long j = 0; j < n; j++) {
35713864 long t2 = rem(y[j], p, red_struct);
35723865 y[j] = MulModPrecon(t2, t1, p, t1pinv);
35813874 NTL_IMPORT(n)
35823875 NTL_IMPORT(k)
35833876 NTL_IMPORT(red_struct)
3584 unsigned long * NTL_RESTRICT y = &M[k][0];
3877 unsigned long *y = &M[k][0];
35853878 if (cleanup) {
35863879 for (long i = first; i < last; i++) {
35873880 if (i == k) continue;
35883881 // skip row k: the data won't change, but it
35893882             // technically is a race condition in a multi-threaded
3590 // execution, and it would violate the "restrict"
3591 // contract
3592
3593 unsigned long * NTL_RESTRICT x = &M[i][0];
3883 // execution
3884
3885 unsigned long *x = &M[i][0];
35943886 for (long j = 0; j < n; j++) {
35953887 x[j] = rem(x[j], p, red_struct);
35963888 }
36013893 for (long i = first; i < last; i++) {
36023894 if (i == k) continue; // skip row k
36033895
3604 unsigned long * NTL_RESTRICT x = &M[i][0];
3896 unsigned long *x = &M[i][0];
36053897 long t1 = rem(x[k], p, red_struct);
36063898 t1 = NegateMod(t1, p);
36073899 x[k] = 0;
36363928       // pivot columns, using reverse swap sequence
36373929
36383930 for (long i = 0; i < n; i++) {
3639 unsigned long * NTL_RESTRICT x = &M[i][0];
3931 unsigned long *x = &M[i][0];
36403932
36413933 for (long k = n-1; k >= 0; k--) {
36423934 long pos = P[k];
37444036 // multiply row k by pivot_inv
37454037 long t1 = pivot_inv;
37464038 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
3747 double * NTL_RESTRICT y = &M[k][0];
4039 double *y = &M[k][0];
37484040 for (long j = 0; j < n; j++) {
37494041 long t2 = rem((unsigned long)(long)y[j], p, red_struct);
37504042 y[j] = MulModPrecon(t2, t1, p, t1pinv);
37594051 NTL_IMPORT(n)
37604052 NTL_IMPORT(k)
37614053 NTL_IMPORT(red_struct)
3762 double * NTL_RESTRICT y = &M[k][0];
4054 double *y = &M[k][0];
37634055 if (cleanup) {
37644056 for (long i = first; i < last; i++) {
37654057 if (i == k) continue;
37664058 // skip row k: the data won't change, but it
37674059             // technically is a race condition in a multi-threaded
3768 // execution, and it would violate the "restrict"
3769 // contract
3770
3771 double * NTL_RESTRICT x = &M[i][0];
4060 // execution
4061
4062 double *x = &M[i][0];
37724063 for (long j = 0; j < n; j++) {
37734064 x[j] = rem((unsigned long)(long)x[j], p, red_struct);
37744065 }
37794070 for (long i = first; i < last; i++) {
37804071 if (i == k) continue; // skip row k
37814072
3782 double * NTL_RESTRICT x = &M[i][0];
4073 double *x = &M[i][0];
37834074 long t1 = rem((unsigned long)(long)x[k], p, red_struct);
37844075 t1 = NegateMod(t1, p);
37854076 x[k] = 0;
38024093       // pivot columns, using reverse swap sequence
38034094
38044095 for (long i = 0; i < n; i++) {
3805 double * NTL_RESTRICT x = &M[i][0];
4096 double *x = &M[i][0];
38064097
38074098 for (long k = n-1; k >= 0; k--) {
38084099 long pos = P[k];
39014192 }
39024193
39034194 red_count = red_count-MAT_BLK_SZ;
3904 double * NTL_RESTRICT kpanelp = &M[kpanel][0];
4195 double *kpanelp = &M[kpanel][0];
39054196
39064197 if (cleanup) {
39074198 for (long r = 0; r < n*MAT_BLK_SZ; r++)
39294220 return;
39304221 }
39314222
3932 double * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
4223 double *y = &kpanelp[k*MAT_BLK_SZ];
39334224 if (k != pos) {
39344225 // swap rows pos and k
3935 double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
4226 double *x = &kpanelp[pos*MAT_BLK_SZ];
39364227 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
39374228
39384229 det = NegateMod(det, p);
39574248 for (long i = 0; i < n; i++) {
39584249 if (i == k) continue; // skip row k
39594250
3960 double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
4251 double *x = &kpanelp[i*MAT_BLK_SZ];
39614252 long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
39624253 t1 = NegateMod(t1, p);
39634254 x[k-kk] = 0;
39994290 for (long jpanel = first; jpanel < last; jpanel++) {
40004291 if (jpanel == kpanel) continue;
40014292
4002 double * NTL_RESTRICT jpanelp = &M[jpanel][0];
4293 double *jpanelp = &M[jpanel][0];
40034294
40044295 if (cleanup) {
40054296 for (long r = 0; r < n*MAT_BLK_SZ; r++)
40114302 long pos = P[k];
40124303 if (pos != k) {
40134304 // swap rows pos and k
4014 double * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
4015 double * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
4305 double *pos_p = &jpanelp[pos*MAT_BLK_SZ];
4306 double *k_p = &jpanelp[k*MAT_BLK_SZ];
40164307 for (long j = 0; j < MAT_BLK_SZ; j++)
40174308 _ntl_swap(pos_p[j], k_p[j]);
40184309 }
40454336 if (pos != k) {
40464337 // swap columns pos and k
40474338
4048 double * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4049 double * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
4339 double *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4340 double *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
40504341 for (long i = 0; i < n; i++) {
40514342 _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
40524343 }
41544445 }
41554446
41564447 red_count = red_count-MAT_BLK_SZ;
4157 unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
4448 unsigned long *kpanelp = &M[kpanel][0];
41584449
41594450 if (cleanup) {
41604451 for (long r = 0; r < n*MAT_BLK_SZ; r++)
41824473 return;
41834474 }
41844475
4185 unsigned long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
4476 unsigned long *y = &kpanelp[k*MAT_BLK_SZ];
41864477 if (k != pos) {
41874478 // swap rows pos and k
4188 unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
4479 unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
41894480 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
41904481
41914482 det = NegateMod(det, p);
42104501 for (long i = 0; i < n; i++) {
42114502 if (i == k) continue; // skip row k
42124503
4213 unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
4504 unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
42144505 long t1 = rem(x[k-kk], p, red_struct);
42154506 t1 = NegateMod(t1, p);
42164507 x[k-kk] = 0;
42524543 for (long jpanel = first; jpanel < last; jpanel++) {
42534544 if (jpanel == kpanel) continue;
42544545
4255 unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
4546 unsigned long *jpanelp = &M[jpanel][0];
42564547
42574548 if (cleanup) {
42584549 for (long r = 0; r < n*MAT_BLK_SZ; r++)
42644555 long pos = P[k];
42654556 if (pos != k) {
42664557 // swap rows pos and k
4267 unsigned long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
4268 unsigned long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
4558 unsigned long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
4559 unsigned long *k_p = &jpanelp[k*MAT_BLK_SZ];
42694560 for (long j = 0; j < MAT_BLK_SZ; j++)
42704561 _ntl_swap(pos_p[j], k_p[j]);
42714562 }
43014592 if (pos != k) {
43024593 // swap columns pos and k
43034594
4304 unsigned long * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4305 unsigned long * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
4595 unsigned long *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4596 unsigned long *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
43064597 for (long i = 0; i < n; i++) {
43074598 _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
43084599 }
43974688 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
43984689 long k_max = min(kk+MAT_BLK_SZ, n);
43994690
4400 long * NTL_RESTRICT kpanelp = &M[kpanel][0];
4691 long *kpanelp = &M[kpanel][0];
44014692
44024693
44034694 for (long k = kk; k < k_max; k++) {
44214712 return;
44224713 }
44234714
4424 long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
4715 long *y = &kpanelp[k*MAT_BLK_SZ];
44254716 if (k != pos) {
44264717 // swap rows pos and k
4427 long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
4718 long *x = &kpanelp[pos*MAT_BLK_SZ];
44284719 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
44294720
44304721 det = NegateMod(det, p);
44484739 for (long i = 0; i < n; i++) {
44494740 if (i == k) continue; // skip row k
44504741
4451 long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
4742 long *x = &kpanelp[i*MAT_BLK_SZ];
44524743 long t1 = x[k-kk];
44534744 t1 = NegateMod(t1, p);
44544745 x[k-kk] = 0;
44874778 for (long jpanel = first; jpanel < last; jpanel++) {
44884779 if (jpanel == kpanel) continue;
44894780
4490 long * NTL_RESTRICT jpanelp = &M[jpanel][0];
4781 long *jpanelp = &M[jpanel][0];
44914782
44924783 // perform swaps
44934784 for (long k = kk; k < k_max; k++) {
44944785 long pos = P[k];
44954786 if (pos != k) {
44964787 // swap rows pos and k
4497 long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
4498 long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
4788 long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
4789 long *k_p = &jpanelp[k*MAT_BLK_SZ];
44994790 for (long j = 0; j < MAT_BLK_SZ; j++)
45004791 _ntl_swap(pos_p[j], k_p[j]);
45014792 }
45324823 if (pos != k) {
45334824 // swap columns pos and k
45344825
4535 long * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4536 long * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
4826 long *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4827 long *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
45374828 for (long i = 0; i < n; i++) {
45384829 _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
45394830 }
47435034 // multiply row k by pivot_inv
47445035 long t1 = pivot_inv;
47455036 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
4746 long * NTL_RESTRICT y = &M[k][0];
5037 long *y = &M[k][0];
47475038 // adjust
47485039 for (long j = k+1; j < n; j++)
47495040 y[j] = MulModPrecon(y[j], t1, p, t1pinv);
47625053 NTL_IMPORT(p)
47635054 NTL_IMPORT(n)
47645055 NTL_IMPORT(k)
4765 long * NTL_RESTRICT y = &M[k][0];
5056 long *y = &M[k][0];
47665057
47675058 // adjust
47685059 for (long ii = first; ii < last; ii++) {
47695060 long i = ii + k+1;
47705061
4771 long * NTL_RESTRICT x = &M[i][0];
5062 long *x = &M[i][0];
47725063 long t1 = x[k];
47735064 t1 = NegateMod(t1, p);
47745065 // adjust // x[k] = 0;
49285219 // multiply row k by pivot_inv
49295220 long t1 = pivot_inv;
49305221 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
4931 unsigned long * NTL_RESTRICT y = &M[k][0];
5222 unsigned long *y = &M[k][0];
49325223 for (long j = k+1; j < n; j++) {
49335224 long t2 = rem(y[j], p, red_struct);
49345225 y[j] = MulModPrecon(t2, t1, p, t1pinv);
49455236 NTL_IMPORT(n)
49465237 NTL_IMPORT(k)
49475238 NTL_IMPORT(red_struct)
4948 unsigned long * NTL_RESTRICT y = &M[k][0];
5239 unsigned long *y = &M[k][0];
49495240 if (cleanup) {
49505241 for (long ii = first; ii < last; ii++) {
49515242 long i = ii + k+1;
49525243
4953 unsigned long * NTL_RESTRICT x = &M[i][0];
5244 unsigned long *x = &M[i][0];
49545245 for (long j = k+1; j < n; j++) {
49555246 x[j] = rem(x[j], p, red_struct);
49565247 }
49615252 for (long ii = first; ii < last; ii++) {
49625253 long i = ii + k+1;
49635254
4964 unsigned long * NTL_RESTRICT x = &M[i][0];
5255 unsigned long *x = &M[i][0];
49655256 long t1 = rem(x[k], p, red_struct);
49665257 t1 = NegateMod(t1, p);
49675258 if (t1 == 0) continue;
51245415 // multiply row k by pivot_inv
51255416 long t1 = pivot_inv;
51265417 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
5127 double * NTL_RESTRICT y = &M[k][0];
5418 double *y = &M[k][0];
51285419 for (long j = k+1; j < n; j++) {
51295420 long t2 = rem((unsigned long)(long)y[j], p, red_struct);
51305421 y[j] = MulModPrecon(t2, t1, p, t1pinv);
51415432 NTL_IMPORT(n)
51425433 NTL_IMPORT(k)
51435434 NTL_IMPORT(red_struct)
5144 double * NTL_RESTRICT y = &M[k][0];
5435 double *y = &M[k][0];
51455436 if (cleanup) {
51465437 for (long ii = first; ii < last; ii++) {
51475438 long i = ii + k+1;
51485439
5149 double * NTL_RESTRICT x = &M[i][0];
5440 double *x = &M[i][0];
51505441 for (long j = k+1; j < n; j++) {
51515442 x[j] = rem((unsigned long)(long)x[j], p, red_struct);
51525443 }
51605451 for (long ii = first; ii < last; ii++) {
51615452 long i = ii + k+1;
51625453
5163 double * NTL_RESTRICT x = &M[i][0];
5454 double *x = &M[i][0];
51645455 long t1 = rem((unsigned long)(long)x[k], p, red_struct);
51655456 t1 = NegateMod(t1, p);
51665457 if (t1 == 0) continue;
53035594 }
53045595
53055596 red_count = red_count-MAT_BLK_SZ;
5306 double * NTL_RESTRICT kpanelp = &M[kpanel][0];
5597 double *kpanelp = &M[kpanel][0];
53075598
53085599 if (cleanup) {
53095600 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
53315622 return;
53325623 }
53335624
5334 double * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
5625 double *y = &kpanelp[k*MAT_BLK_SZ];
53355626 if (k != pos) {
53365627 // swap rows pos and k
5337 double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
5628 double *x = &kpanelp[pos*MAT_BLK_SZ];
53385629 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
53395630
53405631 det = NegateMod(det, p);
53635654 for (long i = kk; i < n; i++) {
53645655 if (i == k) continue; // skip row k
53655656
5366 double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
5657 double *x = &kpanelp[i*MAT_BLK_SZ];
53675658 long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
53685659 t1 = NegateMod(t1, p);
53695660 x[k-kk] = 0;
54125703 for (long index = first; index < last; index++) {
54135704 long jpanel = index + kpanel+1;
54145705
5415 double * NTL_RESTRICT jpanelp = &M[jpanel][0];
5706 double *jpanelp = &M[jpanel][0];
54165707
54175708 if (cleanup) {
54185709 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
54245715 long pos = P[k];
54255716 if (pos != k) {
54265717 // swap rows pos and k
5427 double * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
5428 double * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
5718 double *pos_p = &jpanelp[pos*MAT_BLK_SZ];
5719 double *k_p = &jpanelp[k*MAT_BLK_SZ];
54295720 for (long j = 0; j < MAT_BLK_SZ; j++)
54305721 _ntl_swap(pos_p[j], k_p[j]);
54315722 }
55745865 }
55755866
55765867 red_count = red_count-MAT_BLK_SZ;
5577 unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
5868 unsigned long *kpanelp = &M[kpanel][0];
55785869
55795870 if (cleanup) {
55805871 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
56025893 return;
56035894 }
56045895
5605 unsigned long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
5896 unsigned long *y = &kpanelp[k*MAT_BLK_SZ];
56065897 if (k != pos) {
56075898 // swap rows pos and k
5608 unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
5899 unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
56095900 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
56105901
56115902 det = NegateMod(det, p);
56345925 for (long i = kk; i < n; i++) {
56355926 if (i == k) continue; // skip row k
56365927
5637 unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
5928 unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
56385929 long t1 = rem(x[k-kk], p, red_struct);
56395930 t1 = NegateMod(t1, p);
56405931 x[k-kk] = 0;
56825973 for (long index = first; index < last; index++) {
56835974 long jpanel = index + kpanel+1;
56845975
5685 unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
5976 unsigned long *jpanelp = &M[jpanel][0];
56865977
56875978 if (cleanup) {
56885979 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
56945985 long pos = P[k];
56955986 if (pos != k) {
56965987 // swap rows pos and k
5697 unsigned long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
5698 unsigned long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
5988 unsigned long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
5989 unsigned long *k_p = &jpanelp[k*MAT_BLK_SZ];
56995990 for (long j = 0; j < MAT_BLK_SZ; j++)
57005991 _ntl_swap(pos_p[j], k_p[j]);
57015992 }
58296120 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
58306121 long k_max = min(kk+MAT_BLK_SZ, n);
58316122
5832 long * NTL_RESTRICT kpanelp = &M[kpanel][0];
6123 long *kpanelp = &M[kpanel][0];
58336124
58346125 for (long k = kk; k < k_max; k++) {
58356126
58526143 return;
58536144 }
58546145
5855 long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
6146 long *y = &kpanelp[k*MAT_BLK_SZ];
58566147 if (k != pos) {
58576148 // swap rows pos and k
5858 long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
6149 long *x = &kpanelp[pos*MAT_BLK_SZ];
58596150 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
58606151
58616152 det = NegateMod(det, p);
58836174 for (long i = kk; i < n; i++) {
58846175 if (i == k) continue; // skip row k
58856176
5886 long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
6177 long *x = &kpanelp[i*MAT_BLK_SZ];
58876178 long t1 = x[k-kk];
58886179 t1 = NegateMod(t1, p);
58896180 x[k-kk] = 0;
59286219 for (long index = first; index < last; index++) {
59296220 long jpanel = index + kpanel+1;
59306221
5931 long * NTL_RESTRICT jpanelp = &M[jpanel][0];
6222 long *jpanelp = &M[jpanel][0];
59326223
59336224 // perform swaps
59346225 for (long k = kk; k < k_max; k++) {
59356226 long pos = P[k];
59366227 if (pos != k) {
59376228 // swap rows pos and k
5938 long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
5939 long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
6229 long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
6230 long *k_p = &jpanelp[k*MAT_BLK_SZ];
59406231 for (long j = 0; j < MAT_BLK_SZ; j++)
59416232 _ntl_swap(pos_p[j], k_p[j]);
59426233 }
61816472 NTL_IMPORT(n)
61826473 NTL_IMPORT(k)
61836474 NTL_IMPORT(r)
6184 long * NTL_RESTRICT y = &M[r][0];
6475 long *y = &M[r][0];
61856476
61866477 for (long ii = first; ii < last; ii++) {
61876478 long i = ii + r+1;
61886479
6189 long * NTL_RESTRICT x = &M[i][0];
6480 long *x = &M[i][0];
61906481 long t1 = x[k];
61916482 t1 = MulMod(t1, pivot_inv, p);
61926483 t1 = NegateMod(t1, p);
63346625 static inline
63356626 void SwapOneRow(double *panelp, long i, long pos)
63366627 {
6337 double * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
6338 double * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
6628 double *pos_p = &panelp[pos*MAT_BLK_SZ];
6629 double *i_p = &panelp[i*MAT_BLK_SZ];
63396630 for (long j = 0; j < MAT_BLK_SZ; j++)
63406631 _ntl_swap(pos_p[j], i_p[j]);
63416632 }
64166707
64176708 AlignedArray<double> aux_panel_store;
64186709 aux_panel_store.SetLength(n*MAT_BLK_SZ);
6419 double * NTL_RESTRICT aux_panel = &aux_panel_store[0];
6710 double *aux_panel = &aux_panel_store[0];
64206711
64216712
64226713 AlignedArray<double> buf_store1;
64826773 kpanel++;
64836774 }
64846775
6485 double * NTL_RESTRICT kpanelp = &M[kpanel][0];
6776 double *kpanelp = &M[kpanel][0];
64866777
64876778 if (k == kk) { // a fresh kpanel -- special processing
64886779
65306821 continue;
65316822 }
65326823
6533 double * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
6534 double * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
6824 double *y = &kpanelp[r*MAT_BLK_SZ];
6825 double *y1 = &aux_panel[r*MAT_BLK_SZ];
65356826 if (r != pos) {
65366827 // swap rows pos and r
6537 double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
6538 double * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
6828 double *x = &kpanelp[pos*MAT_BLK_SZ];
6829 double *x1 = &aux_panel[pos*MAT_BLK_SZ];
65396830
65406831 for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
65416832 for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
65526843
65536844 // clear column
65546845 for (long i = r+1; i < n; i++) {
6555 double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
6556 double * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
6846 double *x = &kpanelp[i*MAT_BLK_SZ];
6847 double *x1 = &aux_panel[i*MAT_BLK_SZ];
65576848 long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
65586849 t1 = MulMod(t1, pivot_inv, p);
65596850 t1 = NegateMod(t1, p);
66036894 for (long index = first; index < last; index++) {
66046895 long jpanel = index + kpanel+1;
66056896
6606 double * NTL_RESTRICT jpanelp = &M[jpanel][0];
6897 double *jpanelp = &M[jpanel][0];
66076898
66086899 if (cleanup) {
66096900 for (long h = 0; h < n*MAT_BLK_SZ; h++)
68217112 static inline
68227113 void SwapOneRow(unsigned long *panelp, long i, long pos)
68237114 {
6824 unsigned long * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
6825 unsigned long * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
7115 unsigned long *pos_p = &panelp[pos*MAT_BLK_SZ];
7116 unsigned long *i_p = &panelp[i*MAT_BLK_SZ];
68267117 for (long j = 0; j < MAT_BLK_SZ; j++)
68277118 _ntl_swap(pos_p[j], i_p[j]);
68287119 }
69047195
69057196 UniqueArray<unsigned long> aux_panel_store;
69067197 aux_panel_store.SetLength(n*MAT_BLK_SZ);
6907 unsigned long * NTL_RESTRICT aux_panel = &aux_panel_store[0];
7198 unsigned long *aux_panel = &aux_panel_store[0];
69087199
69097200
69107201 UniqueArray<unsigned long> buf_store1;
69757266 kpanel++;
69767267 }
69777268
6978 unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
7269 unsigned long *kpanelp = &M[kpanel][0];
69797270
69807271 if (k == kk) { // a fresh kpanel -- special processing
69817272
70257316 continue;
70267317 }
70277318
7028 unsigned long * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
7029 unsigned long * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
7319 unsigned long *y = &kpanelp[r*MAT_BLK_SZ];
7320 unsigned long *y1 = &aux_panel[r*MAT_BLK_SZ];
70307321 if (r != pos) {
70317322 // swap rows pos and r
7032 unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
7033 unsigned long * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
7323 unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
7324 unsigned long *x1 = &aux_panel[pos*MAT_BLK_SZ];
70347325
70357326 for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
70367327 for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
70477338
70487339 // clear column
70497340 for (long i = r+1; i < n; i++) {
7050 unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
7051 unsigned long * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
7341 unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
7342 unsigned long *x1 = &aux_panel[i*MAT_BLK_SZ];
70527343 long t1 = rem(x[k-kk], p, red_struct);
70537344 t1 = MulMod(t1, pivot_inv, p);
70547345 t1 = NegateMod(t1, p);
70987389 for (long index = first; index < last; index++) {
70997390 long jpanel = index + kpanel+1;
71007391
7101 unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
7392 unsigned long *jpanelp = &M[jpanel][0];
71027393
71037394 if (cleanup) {
71047395 for (long h = 0; h < n*MAT_BLK_SZ; h++)
73237614 static inline
73247615 void SwapOneRow(long *panelp, long i, long pos)
73257616 {
7326 long * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
7327 long * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
7617 long *pos_p = &panelp[pos*MAT_BLK_SZ];
7618 long *i_p = &panelp[i*MAT_BLK_SZ];
73287619 for (long j = 0; j < MAT_BLK_SZ; j++)
73297620 _ntl_swap(pos_p[j], i_p[j]);
73307621 }
74087699
74097700 UniqueArray<long> aux_panel_store;
74107701 aux_panel_store.SetLength(n*MAT_BLK_SZ);
7411 long * NTL_RESTRICT aux_panel = &aux_panel_store[0];
7702 long *aux_panel = &aux_panel_store[0];
74127703
74137704
74147705 UniqueArray<long> buf_store1;
74627753 kpanel++;
74637754 }
74647755
7465 long * NTL_RESTRICT kpanelp = &M[kpanel][0];
7756 long *kpanelp = &M[kpanel][0];
74667757
74677758 if (k == kk) { // a fresh kpanel -- special processing
74687759
75047795 continue;
75057796 }
75067797
7507 long * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
7508 long * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
7798 long *y = &kpanelp[r*MAT_BLK_SZ];
7799 long *y1 = &aux_panel[r*MAT_BLK_SZ];
75097800 if (r != pos) {
75107801 // swap rows pos and r
7511 long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
7512 long * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
7802 long *x = &kpanelp[pos*MAT_BLK_SZ];
7803 long *x1 = &aux_panel[pos*MAT_BLK_SZ];
75137804
75147805 for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
75157806 for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
75207811
75217812 // clear column
75227813 for (long i = r+1; i < n; i++) {
7523 long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
7524 long * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
7814 long *x = &kpanelp[i*MAT_BLK_SZ];
7815 long *x1 = &aux_panel[i*MAT_BLK_SZ];
75257816 long t1 = x[k-kk];
75267817 t1 = MulMod(t1, pivot_inv, p);
75277818 t1 = NegateMod(t1, p);
75687859 for (long index = first; index < last; index++) {
75697860 long jpanel = index + kpanel+1;
75707861
7571 long * NTL_RESTRICT jpanelp = &M[jpanel][0];
7862 long *jpanelp = &M[jpanel][0];
75727863
75737864 // perform swaps
75747865 ApplySwaps(jpanelp, rr, r, P);
138138 lzz_pEX.o lzz_pEXFactoring.o lzz_pX.o lzz_pX1.o lzz_pXCharPoly.o \
139139 lzz_pXFactoring.o mat_GF2.o mat_GF2E.o mat_RR.o mat_ZZ.o mat_ZZ_p.o mat_ZZ_pE.o \
140140 mat_lzz_p.o mat_lzz_pE.o mat_poly_ZZ.o mat_poly_ZZ_p.o mat_poly_lzz_p.o \
141 quad_float.o tools.o vec_GF2.o vec_GF2E.o vec_RR.o vec_ZZ.o vec_ZZ_p.o \
141 quad_float.o quad_float1.o tools.o vec_GF2.o vec_GF2E.o vec_RR.o vec_ZZ.o vec_ZZ_p.o \
142142 vec_ZZ_pE.o vec_lzz_p.o vec_lzz_pE.o xdouble.o G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o \
143 G_LLL_RR.o thread.o BasicThreadPool.o MatPrime.o
143 G_LLL_RR.o thread.o BasicThreadPool.o MatPrime.o pd_FFT.o
144144
145145 # library source files
146146 SRC=FFT.cpp FacVec.cpp GF2.cpp GF2E.cpp GF2EX.cpp GF2EXFactoring.cpp GF2X.cpp \
152152 lzz_pX.cpp lzz_pX1.cpp lzz_pXCharPoly.cpp lzz_pXFactoring.cpp mat_GF2.cpp \
153153 mat_GF2E.cpp mat_RR.cpp mat_ZZ.cpp mat_ZZ_p.cpp mat_ZZ_pE.cpp mat_lzz_p.cpp \
154154 mat_lzz_pE.cpp mat_poly_ZZ.cpp mat_poly_ZZ_p.cpp mat_poly_lzz_p.cpp \
155 quad_float.cpp tools.cpp vec_GF2.cpp vec_GF2E.cpp vec_RR.cpp vec_ZZ.cpp \
155 quad_float.cpp quad_float1.cpp tools.cpp vec_GF2.cpp vec_GF2E.cpp vec_RR.cpp vec_ZZ.cpp \
156156 vec_ZZ_p.cpp vec_ZZ_pE.cpp vec_lzz_p.cpp vec_lzz_pE.cpp xdouble.cpp \
157157 G_LLL_FP.cpp G_LLL_QP.cpp G_LLL_XD.cpp G_LLL_RR.cpp thread.cpp \
158 BasicThreadPool.cpp MatPrime.cpp
158 BasicThreadPool.cpp MatPrime.cpp pd_FFT.cpp
159159
160160
161161
162162 # library header files
163 INCL=FFT.h FacVec.h GF2.h GF2E.h GF2EX.h GF2EXFactoring.h GF2X.h \
163 INCL=FFT.h FFT_impl.h FacVec.h GF2.h GF2E.h GF2EX.h GF2EXFactoring.h GF2X.h \
164164 GF2XFactoring.h GF2XVec.h HNF.h ctools.h LLL.h RR.h WordVector.h \
165165 ZZ.h ZZ_limbs.h sp_arith.h ZZVec.h ZZX.h ZZXFactoring.h ZZ_p.h ZZ_pE.h ZZ_pEX.h \
166166 ZZ_pEXFactoring.h ZZ_pX.h ZZ_pXFactoring.h fileio.h lip.h lzz_p.h lzz_pE.h \
174174 vec_vec_GF2E.h vec_vec_RR.h vec_vec_ZZ.h vec_vec_ZZ_p.h vec_vec_ZZ_pE.h \
175175 vec_vec_long.h vec_vec_lzz_p.h vec_vec_lzz_pE.h vec_xdouble.h xdouble.h \
176176 config.h version.h new.h vec_ulong.h vec_vec_ulong.h SmartPtr.h \
177 Lazy.h LazyTable.h thread.h BasicThreadPool.h MatPrime.h
177 Lazy.h LazyTable.h thread.h BasicThreadPool.h MatPrime.h PD.h pd_FFT.h
178178
179179
180180
186186
187187
188188 # test source files
189 TS=QuickTest.cpp ZZTest.cpp BerlekampTest.cpp CanZassTest.cpp ZZXFacTest.cpp \
190 MoreFacTest.cpp LLLTest.cpp subset.cpp MatrixTest.cpp mat_lzz_pTest.cpp \
191 CharPolyTest.cpp RRTest.cpp QuadTest.cpp GF2XTest.cpp GF2EXTest.cpp \
192 BitMatTest.cpp ZZ_pEXTest.cpp lzz_pEXTest.cpp Timing.cpp ThreadTest.cpp \
193 ExceptionTest.cpp
189 TS=QuickTest.cpp ZZTest.cpp SSMulTest.cpp ZZ_pXTest.cpp lzz_pXTest.cpp BerlekampTest.cpp \
190 CanZassTest.cpp ZZXFacTest.cpp MoreFacTest.cpp \
191 LLLTest.cpp subset.cpp MatrixTest.cpp mat_lzz_pTest.cpp \
192 CharPolyTest.cpp RRTest.cpp QuadTest.cpp GF2XTest.cpp GF2EXTest.cpp GF2EXGCDTest.cpp \
193 BitMatTest.cpp ZZ_pEXTest.cpp ZZ_pEXGCDTest.cpp lzz_pEXTest.cpp lzz_pEXGCDTest.cpp \
194 Timing.cpp ThreadTest.cpp ExceptionTest.cpp
195
196 # aux source to help compute crossovers
197 CROSS=GF2EXDivCross.cpp GF2EXGCDCross.cpp GF2EXKarCross.cpp GF2EXModCross.cpp
198
194199
195200 # scripts
196201 SCRIPTS=MakeGetTime MakeGetPID MakeCheckFeatures ResetFeatures CopyFeatures \
207212 AUXPROGS = TestGetTime TestGetPID CheckFeatures CheckCompile GenConfigInfo CheckContract \
208213 CheckThreads
209214
210 FEATURES=ALIGNED_ARRAY BUILTIN_CLZL LL_TYPE SSSE3 AVX PCLMUL AVX2 FMA \
215 FEATURES=ALIGNED_ARRAY BUILTIN_CLZL LL_TYPE SSSE3 AVX PCLMUL AVX2 FMA AVX512F \
211216 COPY_TRAITS1 COPY_TRAITS2 CHRONO_TIME MACOS_TIME POSIX_TIME
212217
213218
262267
263268
264269
265 DOC = $(DFILES) $(HTFILES)
270 DOC = $(DFILES) $(HTFILES) TFT-time.jpg zmulrat.jpg flintrat.jpg
266271
267272
268273 # test program executables
269 PROGS=QuickTest ZZTest BerlekampTest CanZassTest ZZXFacTest MoreFacTest LLLTest \
274 PROGS=QuickTest ZZTest SSMulTest ZZ_pXTest lzz_pXTest BerlekampTest CanZassTest \
275 ZZXFacTest MoreFacTest LLLTest \
270276 BitMatTest MatrixTest mat_lzz_pTest CharPolyTest RRTest QuadTest GF2XTest \
271 GF2EXTest subset ZZ_pEXTest lzz_pEXTest Timing ThreadTest
277 GF2EXTest GF2EXGCDTest subset ZZ_pEXTest ZZ_pEXGCDTest lzz_pEXTest lzz_pEXGCDTest \
278 Timing ThreadTest
272279
273280 # things to save to a tar file
274 SFILES=makefile $(SRC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) mach_desc.win \
281 SFILES=$(SRC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) $(CROSS) mach_desc.win \
275282 Poly1TimeTest.cpp Poly2TimeTest.cpp Poly3TimeTest.cpp GF2XTimeTest.cpp \
276283 InitSettings.cpp DispSettings.cpp WizardAux Wizard
277284
368375
369376 quad_float.o: quad_float.cpp
370377 $(LCOMP) $(COMPILE) $(NOCONTRACT) quad_float.cpp
378
379 pd_FFT.o: pd_FFT.cpp
380 $(LCOMP) $(COMPILE) $(NOCONTRACT) pd_FFT.cpp
371381
372382 CheckCompile: CheckCompile.cpp
373383 $(LINK) -o CheckCompile CheckCompile.cpp $(LDLIBS)
524534 rm -f `cat WINDIR`.zip
525535 mv dos `cat WINDIR`
526536 chmod -R a+rX `cat WINDIR`
527 find ./`cat WINDIR` '!' -name '*.gif' -print | zip -l `cat WINDIR` -@
528 find ./`cat WINDIR` -name '*.gif' -print | zip -u `cat WINDIR` -@
537 find ./`cat WINDIR` '!' '(' -name '*.gif' -o -name '*.jpg' ')' -print | zip -l `cat WINDIR` -@
538 find ./`cat WINDIR` -name '*.gif' -o -name '*.jpg' -print | zip -u `cat WINDIR` -@
529539 rm -rf `cat WINDIR`
530540
531541
0
1 // The configure script should define NTL_FP_CONTRACT_OFF
2 // for icc via the NOCONTRACT variable
3 #ifdef NTL_FP_CONTRACT_OFF
4 #pragma fp_contract(off)
5 #endif
6
7
8 #include <NTL/tools.h>
9
10 #ifdef NTL_ENABLE_AVX_FFT
11
12 // The configure script tries to prevent this, but we
13 // double-check here. Note that while it is strongly
14 // discouraged, other parts of NTL probably work even with
15 // "fast math"; however, quad_float will definitely break.
16
17 #if (defined(__GNUC__) && __FAST_MATH__)
18 #error "do not compile pd_FFT.cpp with -ffast-math!!"
19 #endif
20
21
22
23 #include <NTL/PD.h>
24 #include <NTL/pd_FFT.h>
25 #include <NTL/FFT_impl.h>
26
27 #if (defined(__GNUC__) && __FAST_MATH__)
28 #error "do not compile pd_FFT.cpp with -ffast-math!!"
29 #endif
30
31 #if (NTL_FMA_DETECTED && !defined(NTL_CONTRACTION_FIXED))
32 #error "contraction not fixed"
33 #endif
34
35
36 NTL_START_IMPL
37
38 #define NTL_CSR_NEAREST (0x00000000)
39 #define NTL_CSR_DOWN (0x00002000)
40 #define NTL_CSR_UP (0x00004000)
41 #define NTL_CSR_TRUNC (0x00006000)
42 #define NTL_CSR_MASK (0x00006000)
43
44 CSRPush::CSRPush()
45 {
46 // save current register value
47 reg = _mm_getcsr();
48 // set rounding mode to "down"
49 _mm_setcsr((reg & ~NTL_CSR_MASK) | NTL_CSR_DOWN);
50 }
51
52 CSRPush::~CSRPush()
53 {
54 _mm_setcsr(reg);
55 }
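// Illustrative usage, added for reference (assumed, not taken from the diff):
// the guard is meant to be created on the stack around the AVX FFT kernels, so
// the MXCSR rounding mode is "down" while they run and is restored on scope exit:
//
//    {
//       CSRPush guard;   // rounding mode switched to "down"
//       pd_fft_trunc_impl(A, a, xp, lgN, mod, yn, xn);
//    }                   // destructor restores the saved MXCSR here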
56
57
58
59 void
60 pd_LazyPrepMulModPrecon_impl(double *bninv, const double *b, double n, long len)
61 {
62 for (long i = 0; i < len; i++) bninv[i] = b[i]/n;
63 }
64
65
66
67 template<class pd> pd
68 pd_LazyReduce1(pd a, double q)
69 {
70 return correct_excess(a, q);
71 }
72
73 template<class pd> pd
74 pd_LazyReduce2(pd a, double q)
75 {
76 return correct_excess(a, 2*q);
77 }
78
79 // inputs in [0, 2*n), output in [0, 4*n)
80 template<class pd> pd
81 pd_LazyAddMod(pd a, pd b, double n)
82 {
83 return a+b;
84 }
85
86 // inputs in [0, 2*n), output in [0, 4*n)
87 template<class pd> pd
88 pd_LazySubMod(pd a, pd b, double n)
89 {
90 return a-b+2*n;
91 }
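// Why the stated bounds for pd_LazyAddMod/pd_LazySubMod hold (added for
// reference): if a and b lie in [0, 2*n), then a + b lies in [0, 4*n), and
// a - b lies in (-2*n, 2*n), so a - b + 2*n lies in (0, 4*n) and is congruent
// to a - b mod n.  E.g. with n = 5, a = 9, b = 1: pd_LazySubMod returns
// 9 - 1 + 10 = 18, which is in [0, 20) and 18 mod 5 = 3 = (9 - 1) mod 5.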
92
93 // inputs in [0, 2*n), output in [0, 2*n)
94 template<class pd> pd
95 pd_LazyAddMod2(pd a, pd b, double n)
96 {
97 pd r = a+b;
98 return correct_excess(r, 2*n);
99 }
100
101 // inputs in [0, 2*n), output in [0, 2*n)
102 template<class pd> pd
103 pd_LazySubMod2(pd a, pd b, double n)
104 {
105 pd r = a-b;
106 return correct_deficit(r, 2*n);
107 }
108
109 // inputs in [0, 4*n), output in [0, 4*n)
110 template<class pd> pd
111 pd_LazyAddMod4(pd a, pd b, double n)
112 {
113 pd r = a+b;
114 return correct_excess(r, 4*n);
115 }
116
117 // inputs in [0, 4*n), output in [0, 4*n)
118 template<class pd> pd
119 pd_LazySubMod4(pd a, pd b, double n)
120 {
121 pd r = a-b;
122 return correct_deficit(r, 4*n);
123 }
124
125
126 // Input and output in [0, 4*n)
127 template<class pd> pd
128 pd_LazyDoubleMod4(pd a, double n)
129 {
130 return 2 * pd_LazyReduce2(a, n);
131 }
132
133 // Input and output in [0, 2*n)
134 template<class pd> pd
135 pd_LazyDoubleMod2(pd a, double n)
136 {
137 return 2 * pd_LazyReduce1(a, n);
138 }
139
140
141
142 // n in [0,2^50), b in [0,n), a in [0,4*n), bninv = RoundDown(b/n)
143 // returns a*b mod n in [0, 2*n)
144 template<class pd> pd
145 pd_LazyMulModPrecon(pd a, pd b, double n, pd bninv)
146 {
147 pd hi = a*b;
148 pd lo = fused_mulsub(a, b, hi); // hi+lo == a*b (exactly)
149 pd q = fused_muladd(a, bninv, 1L << 52);
150 q -= (1L << 52); // q is the correct quotient, or one too small
151 pd d = fused_negmuladd(q, n, hi); // d == hi - q*n (exactly)
152 pd r = d + lo; // r is the remainder, or the remainder plus n
153
154 return r;
155 }
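// Scalar sketch of the same computation, added for illustration only (this
// helper is not part of NTL's API).  It mirrors the vector code above and,
// like it, assumes the rounding mode has been set to "down" (see CSRPush)
// so that the 2^52 shift trick yields floor(a*bninv).
#include <cmath>
static double scalar_LazyMulModPrecon(double a, double b, double n, double bninv)
{
   const double shift = 4503599627370496.0;        // 2^52
   double hi = a * b;
   double lo = std::fma(a, b, -hi);                // hi + lo == a*b (exactly)
   double q  = std::fma(a, bninv, shift) - shift;  // quotient, or one too small
   double d  = std::fma(-q, n, hi);                // d == hi - q*n (exactly)
   return d + lo;                                  // a*b mod n, or that plus n
}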
156
157 // return (a[0] + a[1], a[0] - a[1], a[2] + a[3], a[2] - a[3], ...)
158 // all inputs and outputs in [0, 2*n)
159 template<class pd> pd
160 pd_fwd_butterfly_packed2(pd a, double n)
161 {
162 pd b = swap2(a);
163 pd sum = pd_LazyAddMod(a, b, n);
164 pd diff = pd_LazySubMod(b, a, n);
165 pd res = blend2(sum, diff);
166 res = pd_LazyReduce2(res, n);
167 return res;
168 }
169
170 // return (a[0] + a[2], a[1] + a[3], (a[0] - a[2]), (a[1] - a[3])*root, ...)
171 // all inputs and outputs in [0, 2*n)
172 // it is also assumed that w = (1,1,1,root,...) and wninv = RoundDown(w/n)
173 template<class pd> pd
174 pd_fwd_butterfly_packed4(pd a, pd w, double n, pd wninv)
175 {
176 pd b = swap4(a);
177 pd sum = pd_LazyAddMod(a, b, n);
178 pd diff = pd_LazySubMod(b, a, n);
179 pd res = blend4(sum, diff);
180 res = pd_LazyMulModPrecon(res, w, n, wninv);
181 return res;
182 }
183
184
185 static double
186 pd_LazyPrepMulModPrecon(long b, long n)
187 {
188 return double(b)/double(n);
189 }
190
191
192 //===================================
193
194
195 #define NTL_PD_FFT_THRESH (11)
196
197 #define PDLGSZ NTL_LG2_PDSZ
198 #define PDSZ NTL_PDSZ
199
200 #if (PDSZ == 8)
201 typedef PD<8> pd_full;
202 typedef PD<4> pd_half;
203 typedef PD<2> pd_qrtr;
204 #else
205 typedef PD<4> pd_full;
206 typedef PD<2> pd_half;
207 #endif
208
209 #define PDLD pd_full::load
210
211
212 // this assumes xx0, xx1, w, wqinv are pd_half's
213 #define fwd_butterfly_half(xx0, xx1, w, q, wqinv) \
214 do \
215 { \
216 pd_half x0_ = xx0; \
217 pd_half x1_ = xx1; \
218 pd_half t_ = pd_LazySubMod(x0_, x1_, q); \
219 xx0 = pd_LazyAddMod2(x0_, x1_, q); \
220 xx1 = pd_LazyMulModPrecon(t_, w, q, wqinv); \
221 } \
222 while (0)
223
224 // this assumes xx0, xx1, w, wqinv are pd_full's
225 #define fwd_butterfly_full(xx0, xx1, w, q, wqinv) \
226 do \
227 { \
228 pd_full x0_ = xx0; \
229 pd_full x1_ = xx1; \
230 pd_full t_ = pd_LazySubMod(x0_, x1_, q); \
231 xx0 = pd_LazyAddMod2(x0_, x1_, q); \
232 xx1 = pd_LazyMulModPrecon(t_, w, q, wqinv); \
233 } \
234 while (0)
235
236 // this assumes xx0_ptr, xx1_ptr, w_ptr, wqinv_ptr are double pointers
237 // which are read/written as pd_full's.
238 // In gcc, the restrict keyword will help code generation.
239 #define fwd_butterfly(xx0_ptr, xx1_ptr, w_ptr, q, wqinv_ptr) \
240 do \
241 { \
242 pd_full x0_ = PDLD(xx0_ptr); \
243 pd_full x1_ = PDLD(xx1_ptr); \
244 pd_full w_ = PDLD(w_ptr); \
245 pd_full wqinv_ = PDLD(wqinv_ptr); \
246 pd_full t_ = pd_LazySubMod(x0_, x1_, q); \
247 store(xx0_ptr, pd_LazyAddMod2(x0_, x1_, q)); \
248 store(xx1_ptr, pd_LazyMulModPrecon(t_, w_, q, wqinv_)); \
249 } \
250 while (0)
251
252
253 #if 0
254 #define fwd_butterfly_x4(xx0_ptr, xx1_ptr, w_ptr, q, wqinv_ptr) \
255 do \
256 { \
257 pd_full xx0_0_ = PDLD(xx0_ptr+0*PDSZ); pd_full xx1_0_ = PDLD(xx1_ptr+0*PDSZ); \
258 pd_full xx0_1_ = PDLD(xx0_ptr+1*PDSZ); pd_full xx1_1_ = PDLD(xx1_ptr+1*PDSZ); \
259 pd_full xx0_2_ = PDLD(xx0_ptr+2*PDSZ); pd_full xx1_2_ = PDLD(xx1_ptr+2*PDSZ); \
260 pd_full xx0_3_ = PDLD(xx0_ptr+3*PDSZ); pd_full xx1_3_ = PDLD(xx1_ptr+3*PDSZ); \
261 fwd_butterfly_full(xx0_0_, xx1_0_, PDLD(w_ptr+0*PDSZ), q, PDLD(wqinv_ptr+0*PDSZ)); \
262 fwd_butterfly_full(xx0_1_, xx1_1_, PDLD(w_ptr+1*PDSZ), q, PDLD(wqinv_ptr+1*PDSZ)); \
263 fwd_butterfly_full(xx0_2_, xx1_2_, PDLD(w_ptr+2*PDSZ), q, PDLD(wqinv_ptr+2*PDSZ)); \
264 fwd_butterfly_full(xx0_3_, xx1_3_, PDLD(w_ptr+3*PDSZ), q, PDLD(wqinv_ptr+3*PDSZ)); \
265 store(xx0_ptr+0*PDSZ, xx0_0_); store(xx1_ptr+0*PDSZ, xx1_0_); \
266 store(xx0_ptr+1*PDSZ, xx0_1_); store(xx1_ptr+1*PDSZ, xx1_1_); \
267 store(xx0_ptr+2*PDSZ, xx0_2_); store(xx1_ptr+2*PDSZ, xx1_2_); \
268 store(xx0_ptr+3*PDSZ, xx0_3_); store(xx1_ptr+3*PDSZ, xx1_3_); \
269 } \
270 while(0)
271 #else
272 #define fwd_butterfly_x4(xx0_ptr, xx1_ptr, w_ptr, q, wqinv_ptr) \
273 do \
274 { \
275 fwd_butterfly(xx0_ptr+0*PDSZ, xx1_ptr+0*PDSZ, w_ptr+0*PDSZ, q, wqinv_ptr+0*PDSZ); \
276 fwd_butterfly(xx0_ptr+1*PDSZ, xx1_ptr+1*PDSZ, w_ptr+1*PDSZ, q, wqinv_ptr+1*PDSZ); \
277 fwd_butterfly(xx0_ptr+2*PDSZ, xx1_ptr+2*PDSZ, w_ptr+2*PDSZ, q, wqinv_ptr+2*PDSZ); \
278 fwd_butterfly(xx0_ptr+3*PDSZ, xx1_ptr+3*PDSZ, w_ptr+3*PDSZ, q, wqinv_ptr+3*PDSZ); \
279 } \
280 while(0)
281 #endif
282
283
284
285
286 static inline NTL_ALWAYS_INLINE void
287 pd_fft_layer_inner_loop(double* NTL_RESTRICT xp0,
288 double* NTL_RESTRICT xp1,
289 long size,
290 const double* NTL_RESTRICT wtab,
291 const double* NTL_RESTRICT wqinvtab,
292 double q)
293
294 {
295 long j = 0;
296 do {
297 fwd_butterfly_x4(xp0+j, xp1+j, wtab+j, q, wqinvtab+j);
298 j += 4*PDSZ;
299 } while (j < size);
300 }
301
302 // assumes size >= 8*PDSZ
303 static inline NTL_ALWAYS_INLINE void
304 pd_fft_layer(double* xp, long blocks, long size,
305 const double* wtab,
306 const double* wqinvtab,
307 double q)
308 {
309 size /= 2;
310
311 do {
312 pd_fft_layer_inner_loop(xp, xp+size, size, wtab, wqinvtab, q);
313 xp += 2 * size;
314 } while (--blocks != 0);
315 }
316
317
318 // size == 8*PDSZ
319 static inline NTL_ALWAYS_INLINE void
320 pd_fft_layer_size8(double* NTL_RESTRICT xp, long blocks,
321 const double* NTL_RESTRICT wtab,
322 const double* NTL_RESTRICT wqinvtab,
323 double q)
324 {
325 do {
326 fwd_butterfly_x4(xp+0*PDSZ, xp+4*PDSZ, wtab, q, wqinvtab);
327 xp += 8*PDSZ;
328 } while (--blocks != 0);
329 }
330
331 // size == 4*PDSZ
332 static inline NTL_ALWAYS_INLINE void
333 pd_fft_layer_size4(double* NTL_RESTRICT xp, long blocks,
334 const double* NTL_RESTRICT wtab,
335 const double* NTL_RESTRICT wqinvtab,
336 double q)
337 {
338 do {
339 fwd_butterfly(xp+0*PDSZ, xp+2*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
340 fwd_butterfly(xp+1*PDSZ, xp+3*PDSZ, wtab+1*PDSZ, q, wqinvtab+1*PDSZ);
341
342 fwd_butterfly(xp+4*PDSZ, xp+6*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
343 fwd_butterfly(xp+5*PDSZ, xp+7*PDSZ, wtab+1*PDSZ, q, wqinvtab+1*PDSZ);
344
345 xp += 8*PDSZ;
346 blocks -= 2;
347 } while (blocks != 0);
348 }
349
350 // size == 2*PDSZ
351 static inline NTL_ALWAYS_INLINE void
352 pd_fft_layer_size2(double* NTL_RESTRICT xp, long blocks,
353 const double* NTL_RESTRICT wtab,
354 const double* NTL_RESTRICT wqinvtab,
355 double q)
356 {
357 do {
358 fwd_butterfly(xp+0*PDSZ, xp+1*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
359 fwd_butterfly(xp+2*PDSZ, xp+3*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
360 fwd_butterfly(xp+4*PDSZ, xp+5*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
361 fwd_butterfly(xp+6*PDSZ, xp+7*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
362
363 xp += 8*PDSZ;
364 blocks -= 4;
365 } while (blocks != 0);
366 }
367
368 #if (PDSZ == 8)
369 static inline NTL_ALWAYS_INLINE void
370 pd_fft_layer_size1_one_block(double* x,
371 pd_half w8, pd_half w8qinv,
372 pd_full w4, pd_full w4qinv,
373 double q)
374 {
375 pd_half x0 = pd_half::load(x);
376 pd_half x1 = pd_half::load(x+PDSZ/2);
377 fwd_butterfly_half(x0, x1, w8, q, w8qinv);
378 pd_full y = join(x0, x1);
379
380 y = pd_fwd_butterfly_packed4(y, w4, q, w4qinv);
381 y = pd_fwd_butterfly_packed2(y, q);
382
383 store(x, y);
384 }
385
386 // size == PDSZ == 8
387 // processes last three levels, of size 8, 4, and 2.
388 static inline NTL_ALWAYS_INLINE void
389 pd_fft_layer_size1(double* xp, long blocks,
390 const double **w_pp, const double **wqinv_pp,
391 double q)
392 {
393 const double *w8_ptr = *w_pp;
394 const double *w8qinv_ptr = *wqinv_pp;
395
396 const double *w4_ptr = *(w_pp-1);
397 const double *w4qinv_ptr = *(wqinv_pp-1);
398
399 pd_half w8 = pd_half::load(w8_ptr);
400
401 pd_half w8qinv = pd_half::load(w8qinv_ptr);
402
403
404 pd_qrtr w4_qrtr = pd_qrtr::load(w4_ptr);
405 pd_half w4_half = join(w4_qrtr, w4_qrtr);
406 pd_full w4 = join(w4_half, w4_half);
407 w4 = blend4(dup2even(w4), w4);
408
409 pd_qrtr w4qinv_qrtr = pd_qrtr::load(w4qinv_ptr);
410 pd_half w4qinv_half = join(w4qinv_qrtr, w4qinv_qrtr);
411 pd_full w4qinv = join(w4qinv_half, w4qinv_half);
412 w4qinv = blend4(dup2even(w4qinv), w4qinv);
413
414 do {
415 pd_fft_layer_size1_one_block(xp+0*PDSZ, w8, w8qinv, w4, w4qinv, q);
416 pd_fft_layer_size1_one_block(xp+1*PDSZ, w8, w8qinv, w4, w4qinv, q);
417 pd_fft_layer_size1_one_block(xp+2*PDSZ, w8, w8qinv, w4, w4qinv, q);
418 pd_fft_layer_size1_one_block(xp+3*PDSZ, w8, w8qinv, w4, w4qinv, q);
419
420 xp += 4*PDSZ;
421 blocks -= 4;
422 } while (blocks != 0);
423 }
424 #else
425 // PDSZ == 4
426
427 static inline NTL_ALWAYS_INLINE void
428 pd_fft_layer_size1_one_block(double* x,
429 pd_half w4, pd_half w4qinv,
430 double q)
431 {
432 pd_half x0 = pd_half::load(x);
433 pd_half x1 = pd_half::load(x+PDSZ/2);
434 fwd_butterfly_half(x0, x1, w4, q, w4qinv);
435 pd_full y = join(x0, x1);
436
437 y = pd_fwd_butterfly_packed2(y, q);
438
439 store(x, y);
440 }
441
442 // size == PDSZ == 4
443 // processes last two levels, of size 4 and 2.
444 static inline NTL_ALWAYS_INLINE void
445 pd_fft_layer_size1(double* xp, long blocks,
446 const double **w_pp, const double **wqinv_pp,
447 double q)
448 {
449 const double *w4_ptr = *w_pp;
450 const double *w4qinv_ptr = *wqinv_pp;
451
452
453 pd_half w4 = pd_half::load(w4_ptr);
454 pd_half w4qinv = pd_half::load(w4qinv_ptr);
455
456
457 do {
458 pd_fft_layer_size1_one_block(xp+0*PDSZ, w4, w4qinv, q);
459 pd_fft_layer_size1_one_block(xp+1*PDSZ, w4, w4qinv, q);
460 pd_fft_layer_size1_one_block(xp+2*PDSZ, w4, w4qinv, q);
461 pd_fft_layer_size1_one_block(xp+3*PDSZ, w4, w4qinv, q);
462
463 xp += 4*PDSZ;
464 blocks -= 4;
465 } while (blocks != 0);
466 }
467
468 #endif
469
470
471
472
473 void
474 pd_fft_base(double* xp, long lgN, const pd_mod_t& mod)
475 {
476 double q = mod.q;
477 const double** wtab = mod.wtab;
478 const double** wqinvtab = mod.wqinvtab;
479
480 long N = 1L << lgN;
481
482 long j, size, blocks;
483 for (j = lgN, size = N, blocks = 1;
484 size > 8*PDSZ; j--, blocks <<= 1, size >>= 1)
485 pd_fft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
486
487 pd_fft_layer_size8(xp, blocks, wtab[j], wqinvtab[j], q);
488 j--, blocks <<= 1, size >>= 1;
489
490 pd_fft_layer_size4(xp, blocks, wtab[j], wqinvtab[j], q);
491 j--, blocks <<= 1, size >>= 1;
492
493 pd_fft_layer_size2(xp, blocks, wtab[j], wqinvtab[j], q);
494 j--, blocks <<= 1, size >>= 1;
495
496 pd_fft_layer_size1(xp, blocks, wtab+j, wqinvtab+j, q);
497
498 }
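// Worked example of the schedule above, assuming the 4-lane case PDSZ == 4
// (so 8*PDSZ == 32): for lgN == 11 the generic pd_fft_layer loop handles the
// levels of size 2048, 1024, 512, 256, 128 and 64; the specialized routines
// then take size 32 (pd_fft_layer_size8), 16 (size4) and 8 (size2), and
// pd_fft_layer_size1 finishes the last two levels, of size 4 and 2.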
499
500 static inline NTL_ALWAYS_INLINE void
501 pd_move(double *x, const long *a)
502 {
503 pd_full r;
504 loadu(r, a);
505 store(x, r);
506 }
507
508 static inline NTL_ALWAYS_INLINE void
509 pd_move(long *x, const double *a)
510 {
511 pd_full r;
512 load(r, a);
513 storeu(x, r);
514 }
515
516 static inline NTL_ALWAYS_INLINE void
517 pd_reduce1_move(long *x, const double *a, double q)
518 {
519 pd_full r;
520 load(r, a);
521 r = pd_LazyReduce1(r, q);
522 storeu(x, r);
523 }
524
525 static inline NTL_ALWAYS_INLINE void
526 pd_reduce2_move(long *x, const double *a, double q)
527 {
528 pd_full r;
529 load(r, a);
530 r = pd_LazyReduce2(r, q);
531 r = pd_LazyReduce1(r, q);
532 storeu(x, r);
533 }
534
535 static inline NTL_ALWAYS_INLINE void
536 pd_mul_move(long *x, const double *a, pd_full b, double q, pd_full bqinv)
537 {
538 pd_full r;
539 load(r, a);
540 r = pd_LazyMulModPrecon(r, b, q, bqinv);
541 r = pd_LazyReduce1(r, q);
542 storeu(x, r);
543 }
544
545
546
547
548 static
549 void pd_fft_short(double* xp, long yn, long xn, long lgN,
550 const pd_mod_t& mod)
551 {
552 long N = 1L << lgN;
553
554 if (yn == N)
555 {
556 if (xn == N && lgN <= NTL_PD_FFT_THRESH)
557 {
558 // no truncation
559 pd_fft_base(xp, lgN, mod);
560 return;
561 }
562 }
563
564 // divide-and-conquer algorithm
565
566 long half = N >> 1;
567 double q = mod.q;
568
569 if (yn <= half)
570 {
571 if (xn <= half)
572 {
573 pd_fft_short(xp, yn, xn, lgN - 1, mod);
574 }
575 else
576 {
577 xn -= half;
578
579 // (X, Y) -> X + Y
580 for (long j = 0; j < xn; j+=PDSZ)
581 store(xp+j, pd_LazyAddMod2(PDLD(xp+j), PDLD(xp+j+half), q));
582
583 pd_fft_short(xp, yn, half, lgN - 1, mod);
584 }
585 }
586 else
587 {
588 yn -= half;
589
590 double* xp0 = xp;
591 double* xp1 = xp + half;
592 const double* wtab = mod.wtab[lgN];
593 const double* wqinvtab = mod.wqinvtab[lgN];
594
595 if (xn <= half)
596 {
597 // X -> (X, w*X)
598 for (long j = 0; j < xn; j+=PDSZ)
599 store(xp1+j, pd_LazyMulModPrecon(PDLD(xp0+j), PDLD(wtab+j), q, PDLD(wqinvtab+j)));
600
601 pd_fft_short(xp0, half, xn, lgN - 1, mod);
602 pd_fft_short(xp1, yn, xn, lgN - 1, mod);
603 }
604 else
605 {
606 xn -= half;
607
608 // (X, Y) -> (X + Y, w*(X - Y))
609 pd_fft_layer_inner_loop(xp0, xp1, xn, wtab, wqinvtab, q);
610
611 // X -> (X, w*X)
612 for (long j = xn; j < half; j+=PDSZ)
613 store(xp1+j, pd_LazyMulModPrecon(PDLD(xp0+j), PDLD(wtab+j), q, PDLD(wqinvtab+j)));
614
615 pd_fft_short(xp0, half, half, lgN - 1, mod);
616 pd_fft_short(xp1, yn, half, lgN - 1, mod);
617 }
618 }
619 }
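// Note on the recursion above, added for reference: viewing the length-N input
// as two halves (X, Y) of length N/2, one decimation-in-frequency step maps
// (X, Y) -> (X + Y, w*(X - Y)) and each half is then transformed recursively;
// the first N/2 outputs come from the X + Y half, the rest from the w*(X - Y)
// half.  Truncation exploits this: if only yn <= N/2 outputs are wanted, the
// w*(X - Y) half is never computed (the code simply folds Y into X when
// xn > N/2), and if only xn <= N/2 inputs are nonzero, Y is implicitly zero
// and the splitting step degenerates to X -> (X, w*X).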
620
621
622 void
623 pd_fft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
624 long yn, long xn)
625
626 {
627 for (long i = 0; i < xn; i += 4*PDSZ) {
628 pd_move(xp+i+0*PDSZ, a+i+0*PDSZ);
629 pd_move(xp+i+1*PDSZ, a+i+1*PDSZ);
630 pd_move(xp+i+2*PDSZ, a+i+2*PDSZ);
631 pd_move(xp+i+3*PDSZ, a+i+3*PDSZ);
632 }
633
634 pd_fft_short(xp, yn, xn, lgN, mod);
635
636 double q = mod.q;
637 for (long i = 0; i < yn; i += 4*PDSZ) {
638 pd_reduce1_move(A+i+0*PDSZ, xp+i+0*PDSZ, q);
639 pd_reduce1_move(A+i+1*PDSZ, xp+i+1*PDSZ, q);
640 pd_reduce1_move(A+i+2*PDSZ, xp+i+2*PDSZ, q);
641 pd_reduce1_move(A+i+3*PDSZ, xp+i+3*PDSZ, q);
642 }
643 }
644
645
646 void
647 pd_fft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
648 long yn, long xn, double fac)
649
650 {
651 for (long i = 0; i < xn; i += 4*PDSZ) {
652 pd_move(xp+i+0*PDSZ, a+i+0*PDSZ);
653 pd_move(xp+i+1*PDSZ, a+i+1*PDSZ);
654 pd_move(xp+i+2*PDSZ, a+i+2*PDSZ);
655 pd_move(xp+i+3*PDSZ, a+i+3*PDSZ);
656 }
657
658 pd_fft_short(xp, yn, xn, lgN, mod);
659
660 double q = mod.q;
661 double facqinv = fac/q;
662 for (long i = 0; i < yn; i += 4*PDSZ) {
663 pd_mul_move(A+i+0*PDSZ, xp+i+0*PDSZ, fac, q, facqinv);
664 pd_mul_move(A+i+1*PDSZ, xp+i+1*PDSZ, fac, q, facqinv);
665 pd_mul_move(A+i+2*PDSZ, xp+i+2*PDSZ, fac, q, facqinv);
666 pd_mul_move(A+i+3*PDSZ, xp+i+3*PDSZ, fac, q, facqinv);
667 }
668 }
669
670 //================ ifft ==============
671
672 // return (a[0] + a[1], a[0] - a[1], a[2] + a[3], a[2] - a[3], ...)
673 // all inputs and outputs in [0, 4*n)
674 template<class pd> pd
675 pd_inv_butterfly_packed2(pd a, double n)
676 {
677 a = pd_LazyReduce2(a, n);
678 pd b = swap2(a);
679 pd sum = pd_LazyAddMod(a, b, n);
680 pd diff = pd_LazySubMod(b, a, n);
681 pd res = blend2(sum, diff);
682 return res;
683 }
684
685 // return (a[0] + a[2], a[1] + a[3]*root, a[0] - a[2], a[1] - a[3]*root, ...)
686 // all inputs and outputs in [0, 4*n)
687 // it is also assumed that w = (1,1,1,root,...) and wninv = RoundDown(w/n)
688 template<class pd> pd
689 pd_inv_butterfly_packed4(pd a, pd w, double n, pd wninv)
690 {
691 a = pd_LazyMulModPrecon(a, w, n, wninv);
692 pd b = swap4(a);
693 pd sum = pd_LazyAddMod(a, b, n);
694 pd diff = pd_LazySubMod(b, a, n);
695 pd res = blend4(sum, diff);
696 return res;
697 }
698
699 #define inv_butterfly_half(xx0, xx1, w, q, wqinv) \
700 do \
701 { \
702 pd_half x0_ = pd_LazyReduce2(xx0, q); \
703 pd_half x1_ = xx1; \
704 pd_half t_ = pd_LazyMulModPrecon(x1_, w, q, wqinv); \
705 xx0 = pd_LazyAddMod(x0_, t_, q); \
706 xx1 = pd_LazySubMod(x0_, t_, q); \
707 } while (0)
708
709
710 #define inv_butterfly(xx0_ptr, xx1_ptr, w_ptr, q, wqinv_ptr) \
711 do \
712 { \
713 pd_full x0_ = pd_LazyReduce2(PDLD(xx0_ptr), q); \
714 pd_full x1_ = PDLD(xx1_ptr); \
715 pd_full t_ = pd_LazyMulModPrecon(x1_, PDLD(w_ptr), q, PDLD(wqinv_ptr)); \
716 store(xx0_ptr, pd_LazyAddMod(x0_, t_, q)); \
717 store(xx1_ptr, pd_LazySubMod(x0_, t_, q)); \
718 } while (0)
719
720 #define inv_butterfly_x4(xx0_ptr, xx1_ptr, w_ptr, q, wqinv_ptr) \
721 do \
722 { \
723 inv_butterfly(xx0_ptr+0*PDSZ, xx1_ptr+0*PDSZ, w_ptr+0*PDSZ, q, wqinv_ptr+0*PDSZ); \
724 inv_butterfly(xx0_ptr+1*PDSZ, xx1_ptr+1*PDSZ, w_ptr+1*PDSZ, q, wqinv_ptr+1*PDSZ); \
725 inv_butterfly(xx0_ptr+2*PDSZ, xx1_ptr+2*PDSZ, w_ptr+2*PDSZ, q, wqinv_ptr+2*PDSZ); \
726 inv_butterfly(xx0_ptr+3*PDSZ, xx1_ptr+3*PDSZ, w_ptr+3*PDSZ, q, wqinv_ptr+3*PDSZ); \
727 } \
728 while(0)
729
730 static inline NTL_ALWAYS_INLINE void
731 pd_ifft_layer_inner_loop(double* NTL_RESTRICT xp0,
732 double* NTL_RESTRICT xp1,
733 long size,
734 const double* NTL_RESTRICT wtab,
735 const double* NTL_RESTRICT wqinvtab,
736 double q)
737
738 {
739 long j = 0;
740 do {
741 inv_butterfly_x4(xp0+j, xp1+j, wtab+j, q, wqinvtab+j);
742 j += 4*PDSZ;
743 } while (j < size);
744 }
745
746 // assumes size >= 8*PDSZ
747 static inline NTL_ALWAYS_INLINE void
748 pd_ifft_layer(double* xp, long blocks, long size,
749 const double* wtab,
750 const double* wqinvtab,
751 double q)
752 {
753 size /= 2;
754
755 do {
756 pd_ifft_layer_inner_loop(xp, xp+size, size, wtab, wqinvtab, q);
757 xp += 2 * size;
758 } while (--blocks != 0);
759 }
760
761 // size == 8*PDSZ
762 static inline NTL_ALWAYS_INLINE void
763 pd_ifft_layer_size8(double* NTL_RESTRICT xp, long blocks,
764 const double* NTL_RESTRICT wtab,
765 const double* NTL_RESTRICT wqinvtab,
766 double q)
767 {
768 do {
769 inv_butterfly_x4(xp+0*PDSZ, xp+4*PDSZ, wtab, q, wqinvtab);
770 xp += 8*PDSZ;
771 } while (--blocks != 0);
772 }
773
774 // size == 4*PDSZ
775 static inline NTL_ALWAYS_INLINE void
776 pd_ifft_layer_size4(double* NTL_RESTRICT xp, long blocks,
777 const double* NTL_RESTRICT wtab,
778 const double* NTL_RESTRICT wqinvtab,
779 double q)
780 {
781 do {
782 inv_butterfly(xp+0*PDSZ, xp+2*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
783 inv_butterfly(xp+1*PDSZ, xp+3*PDSZ, wtab+1*PDSZ, q, wqinvtab+1*PDSZ);
784
785 inv_butterfly(xp+4*PDSZ, xp+6*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
786 inv_butterfly(xp+5*PDSZ, xp+7*PDSZ, wtab+1*PDSZ, q, wqinvtab+1*PDSZ);
787
788 xp += 8*PDSZ;
789 blocks -= 2;
790 } while (blocks != 0);
791 }
792
793 // size == 2*PDSZ
794 static inline NTL_ALWAYS_INLINE void
795 pd_ifft_layer_size2(double* NTL_RESTRICT xp, long blocks,
796 const double* NTL_RESTRICT wtab,
797 const double* NTL_RESTRICT wqinvtab,
798 double q)
799 {
800 do {
801 inv_butterfly(xp+0*PDSZ, xp+1*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
802 inv_butterfly(xp+2*PDSZ, xp+3*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
803 inv_butterfly(xp+4*PDSZ, xp+5*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
804 inv_butterfly(xp+6*PDSZ, xp+7*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
805
806 xp += 8*PDSZ;
807 blocks -= 4;
808 } while (blocks != 0);
809 }
810
811 #if (PDSZ == 8)
812 static inline NTL_ALWAYS_INLINE void
813 pd_ifft_layer_size1_one_block(double* x,
814 pd_half w8, pd_half w8qinv,
815 pd_full w4, pd_full w4qinv,
816 double q)
817 {
818 pd_full y = PDLD(x);
819 y = pd_inv_butterfly_packed2(y, q);
820 y = pd_inv_butterfly_packed4(y, w4, q, w4qinv);
821
822 pd_half x0 = get_lo(y);
823 pd_half x1 = get_hi(y);
824 inv_butterfly_half(x0, x1, w8, q, w8qinv);
825
826 store(x, x0);
827 store(x+PDSZ/2, x1);
828 }
829
830 // size == PDSZ == 8
831 // processes last three levels, of size 8, 4, and 2.
832 static inline NTL_ALWAYS_INLINE void
833 pd_ifft_layer_size1(double* xp, long blocks,
834 const double **w_pp, const double **wqinv_pp,
835 double q)
836 {
837 const double *w8_ptr = *w_pp;
838 const double *w8qinv_ptr = *wqinv_pp;
839
840 const double *w4_ptr = *(w_pp-1);
841 const double *w4qinv_ptr = *(wqinv_pp-1);
842
843 pd_half w8 = pd_half::load(w8_ptr);
844
845 pd_half w8qinv = pd_half::load(w8qinv_ptr);
846
847
848 pd_qrtr w4_qrtr = pd_qrtr::load(w4_ptr);
849 pd_half w4_half = join(w4_qrtr, w4_qrtr);
850 pd_full w4 = join(w4_half, w4_half);
851 w4 = blend4(dup2even(w4), w4);
852
853 pd_qrtr w4qinv_qrtr = pd_qrtr::load(w4qinv_ptr);
854 pd_half w4qinv_half = join(w4qinv_qrtr, w4qinv_qrtr);
855 pd_full w4qinv = join(w4qinv_half, w4qinv_half);
856 w4qinv = blend4(dup2even(w4qinv), w4qinv);
857
858 do {
859 pd_ifft_layer_size1_one_block(xp+0*PDSZ, w8, w8qinv, w4, w4qinv, q);
860 pd_ifft_layer_size1_one_block(xp+1*PDSZ, w8, w8qinv, w4, w4qinv, q);
861 pd_ifft_layer_size1_one_block(xp+2*PDSZ, w8, w8qinv, w4, w4qinv, q);
862 pd_ifft_layer_size1_one_block(xp+3*PDSZ, w8, w8qinv, w4, w4qinv, q);
863
864 xp += 4*PDSZ;
865 blocks -= 4;
866 } while (blocks != 0);
867 }
868 #else
869 // PDSZ == 4
870
871 static inline NTL_ALWAYS_INLINE void
872 pd_ifft_layer_size1_one_block(double* x,
873 pd_half w4, pd_half w4qinv,
874 double q)
875 {
876 pd_full y = PDLD(x);
877 y = pd_inv_butterfly_packed2(y, q);
878
879 pd_half x0 = get_lo(y);
880 pd_half x1 = get_hi(y);
881 inv_butterfly_half(x0, x1, w4, q, w4qinv);
882
883 store(x, x0);
884 store(x+PDSZ/2, x1);
885 }
886
887 // size == PDSZ == 4
888 // processes last two levels, of size 4 and 2.
889 static inline NTL_ALWAYS_INLINE void
890 pd_ifft_layer_size1(double* xp, long blocks,
891 const double **w_pp, const double **wqinv_pp,
892 double q)
893 {
894 const double *w4_ptr = *w_pp;
895 const double *w4qinv_ptr = *wqinv_pp;
896
897
898 pd_half w4 = pd_half::load(w4_ptr);
899 pd_half w4qinv = pd_half::load(w4qinv_ptr);
900
901
902 do {
903 pd_ifft_layer_size1_one_block(xp+0*PDSZ, w4, w4qinv, q);
904 pd_ifft_layer_size1_one_block(xp+1*PDSZ, w4, w4qinv, q);
905 pd_ifft_layer_size1_one_block(xp+2*PDSZ, w4, w4qinv, q);
906 pd_ifft_layer_size1_one_block(xp+3*PDSZ, w4, w4qinv, q);
907
908 xp += 4*PDSZ;
909 blocks -= 4;
910 } while (blocks != 0);
911 }
912
913 #endif
914
915 void
916 pd_ifft_base(double* xp, long lgN, const pd_mod_t& mod)
917 {
918 double q = mod.q;
919 const double** wtab = mod.wtab;
920 const double** wqinvtab = mod.wqinvtab;
921
922 long N = 1L << lgN;
923
924 long j=PDLGSZ, size=PDSZ, blocks=N/PDSZ;
925
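   // levels of size up to 8*PDSZ are handled by the specialized kernels
   // below; any remaining larger levels fall through to the generic
   // pd_ifft_layer loop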
926 pd_ifft_layer_size1(xp, blocks, wtab+j, wqinvtab+j, q);
927 j++, blocks >>= 1, size <<= 1;
928
929 pd_ifft_layer_size2(xp, blocks, wtab[j], wqinvtab[j], q);
930 j++, blocks >>= 1, size <<= 1;
931
932 pd_ifft_layer_size4(xp, blocks, wtab[j], wqinvtab[j], q);
933 j++, blocks >>= 1, size <<= 1;
934
935 pd_ifft_layer_size8(xp, blocks, wtab[j], wqinvtab[j], q);
936 j++, blocks >>= 1, size <<= 1;
937
938 for (; size <= N; j++, blocks >>= 1, size <<= 1)
939 pd_ifft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
940
941 }
942
943 static void
944 pd_ifft_short2(double* xp, long yn, long lgN, const pd_mod_t& mod);
945
946
947 static void
948 pd_ifft_short1(double* xp, long yn, long lgN, const pd_mod_t& mod)
949
950 // Implements truncated inverse FFT interface, but with xn==yn.
951 // All computations are done in place.
952
953 {
954 long N = 1L << lgN;
955
956 if (yn == N && lgN <= NTL_PD_FFT_THRESH)
957 {
958 // no truncation
959 pd_ifft_base(xp, lgN, mod);
960 return;
961 }
962
963 // divide-and-conquer algorithm
964
965 long half = N >> 1;
966 double q = mod.q;
967
968 if (yn <= half)
969 {
970 // X -> 2X
971 for (long j = 0; j < yn; j+=PDSZ)
972 store(xp+j, pd_LazyDoubleMod4(PDLD(xp+j), q));
973
974 pd_ifft_short1(xp, yn, lgN - 1, mod);
975 }
976 else
977 {
978 double* xp0 = xp;
979 double* xp1 = xp + half;
980
981 pd_ifft_short1(xp0, half, lgN - 1, mod);
982
983 yn -= half;
984
985 if (yn < half) {
986 const double* wtab1 = mod.wtab1[lgN];
987 const double* wqinvtab1 = mod.wqinvtab1[lgN];
988
989 // X -> (2X, w*X)
990 for (long j = yn; j < half; j+=PDSZ)
991 {
992 pd_full x0 = PDLD(xp0+j);
993 store(xp0+j, pd_LazyDoubleMod4(x0, q));
994 store(xp1+j, pd_LazyMulModPrecon(x0, PDLD(wtab1+j), q, PDLD(wqinvtab1+j)));
995 }
996 }
997
998 pd_ifft_short2(xp1, yn, lgN - 1, mod);
999
1000 // (X, Y) -> (X + Y/w, X - Y/w)
1001 pd_ifft_layer_inner_loop(xp0, xp1, yn, mod.wtab[lgN], mod.wqinvtab[lgN], q);
1002 }
1003 }
1004
1005
1006 static void
1007 pd_ifft_short2(double* xp, long yn, long lgN, const pd_mod_t& mod)
1008
1009 // Implements truncated inverse FFT interface, but with xn==N.
1010 // All computations are done in place.
1011
1012 {
1013 long N = 1L << lgN;
1014
1015 if (yn == N && lgN <= NTL_PD_FFT_THRESH)
1016 {
1017 // no truncation
1018 pd_ifft_base(xp, lgN, mod);
1019 return;
1020 }
1021
1022 // divide-and-conquer algorithm
1023
1024 long half = N >> 1;
1025 double q = mod.q;
1026
1027 if (yn <= half)
1028 {
1029 // X -> 2X
1030 for (long j = 0; j < yn; j+=PDSZ)
1031 store(xp+j, pd_LazyDoubleMod4(PDLD(xp+j), q));
1032
1033 // (X, Y) -> X + Y
1034 for (long j = yn; j < half; j+=PDSZ)
1035 store(xp+j, pd_LazyAddMod4(PDLD(xp+j), PDLD(xp+j+half), q));
1036
1037 pd_ifft_short2(xp, yn, lgN - 1, mod);
1038
1039 // (X, Y) -> X - Y
1040 for (long j = 0; j < yn; j+=PDSZ)
1041 store(xp+j, pd_LazySubMod4(PDLD(xp+j), PDLD(xp+j+half), q));
1042 }
1043 else
1044 {
1045 double* xp0 = xp;
1046 double* xp1 = xp + half;
1047
1048 pd_ifft_short1(xp0, half, lgN - 1, mod);
1049
1050 yn -= half;
1051
1052
1053 if (yn < half) {
1054 const double* wtab1 = mod.wtab1[lgN];
1055 const double* wqinvtab1 = mod.wqinvtab1[lgN];
1056
1057 // (X, Y) -> (2X - Y, w*(X - Y))
1058 for (long j = yn; j < half; j+=PDSZ)
1059 {
1060 pd_full x0 = PDLD(xp0+j);
1061 pd_full x1 = PDLD(xp1+j);
1062 pd_full u = pd_LazySubMod4(x0, x1, q);
1063 store(xp0+j, pd_LazyAddMod4(x0, u, q));
1064 store(xp1+j, pd_LazyMulModPrecon(u, PDLD(wtab1+j), q, PDLD(wqinvtab1+j)));
1065 }
1066 }
1067
1068 pd_ifft_short2(xp1, yn, lgN - 1, mod);
1069
1070 // (X, Y) -> (X + Y/w, X - Y/w)
1071 pd_ifft_layer_inner_loop(xp0, xp1, yn, mod.wtab[lgN], mod.wqinvtab[lgN], q);
1072 }
1073 }
1074
1075
1076 void
1077 pd_ifft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
1078 long yn, double fac)
1079 {
1080 long N = 1L << lgN;
1081
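   // load the first yn coefficients, converting from long to double,
   // four vectors per iteration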
1082 for (long i = 0; i < yn; i += 4*PDSZ) {
1083 pd_move(xp+i+0*PDSZ, a+i+0*PDSZ);
1084 pd_move(xp+i+1*PDSZ, a+i+1*PDSZ);
1085 pd_move(xp+i+2*PDSZ, a+i+2*PDSZ);
1086 pd_move(xp+i+3*PDSZ, a+i+3*PDSZ);
1087 }
1088
1089 pd_ifft_short1(xp, yn, lgN, mod);
1090
1091 double q = mod.q;
1092 double facqinv = fac/q;
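   // scale the inverse-transform output by fac (mod q, via the precomputed
   // quotient facqinv) and store the results back into A as longs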
1093 for (long i = 0; i < yn; i += 4*PDSZ) {
1094 pd_mul_move(A+i+0*PDSZ, xp+i+0*PDSZ, fac, q, facqinv);
1095 pd_mul_move(A+i+1*PDSZ, xp+i+1*PDSZ, fac, q, facqinv);
1096 pd_mul_move(A+i+2*PDSZ, xp+i+2*PDSZ, fac, q, facqinv);
1097 pd_mul_move(A+i+3*PDSZ, xp+i+3*PDSZ, fac, q, facqinv);
1098 }
1099 }
1100
1101
1102 void
1103 pd_ifft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
1104 long yn)
1105 {
1106 long N = 1L << lgN;
1107
1108 for (long i = 0; i < yn; i += 4*PDSZ) {
1109 pd_move(xp+i+0*PDSZ, a+i+0*PDSZ);
1110 pd_move(xp+i+1*PDSZ, a+i+1*PDSZ);
1111 pd_move(xp+i+2*PDSZ, a+i+2*PDSZ);
1112 pd_move(xp+i+3*PDSZ, a+i+3*PDSZ);
1113 }
1114
1115 pd_ifft_short1(xp, yn, lgN, mod);
1116
1117 double q = mod.q;
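   // reduce the inverse-transform output mod q and store it back into A as longs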
1118 for (long i = 0; i < yn; i += 4*PDSZ) {
1119 pd_reduce2_move(A+i+0*PDSZ, xp+i+0*PDSZ, q);
1120 pd_reduce2_move(A+i+1*PDSZ, xp+i+1*PDSZ, q);
1121 pd_reduce2_move(A+i+2*PDSZ, xp+i+2*PDSZ, q);
1122 pd_reduce2_move(A+i+3*PDSZ, xp+i+3*PDSZ, q);
1123 }
1124 }
1125
1126 NTL_END_IMPL
1127
1128 #endif
4848 #endif
4949
5050 #include <NTL/quad_float.h>
51 #include <NTL/RR.h>
52
5351 #include <cfloat>
5452
5553
8179
8280
8381 #define START_FIX \
84 volatile unsigned short __old_cw, __new_cw; \
85 asm volatile ("fnstcw %0":"=m" (__old_cw)); \
82 unsigned short __old_cw, __new_cw; \
83 __asm__ volatile ("fnstcw %0":"=m" (__old_cw)::"memory"); \
8684 __new_cw = (__old_cw & ~0x300) | 0x200; \
87 asm volatile ("fldcw %0": :"m" (__new_cw));
88
89
90 #define END_FIX asm volatile ("fldcw %0": :"m" (__old_cw));
85 __asm__ volatile ("fldcw %0"::"m" (__new_cw):"memory");
86
87
88 #define END_FIX __asm__ volatile ("fldcw %0": :"m" (__old_cw));
89
90 // NOTE: "asm volatile" does not guarantee that the asm does
91 // not move. However, the "memory" clobber makes these act as
92 // memory barriers that cannot be moved past a load or store.
93
94 #define NO_INLINE __attribute__ ((noinline))
95 // to protect against LTO inlining which could break the memory
96 // barriers in START_FIX and END_FIX. I've done some testing
97 // on gcc, clang, and icc. The noinline attribute and the volatile
98 // asm together should ensure that the function gets called
99 // and doesn't get inlined during LTO.
100 // That said, I wouldn't really recommend applying LTO to NTL...
101 // and especially to quad_float.cpp.
102
103
104 // NOTE: gcc 8.1 seems a bit buggy: it warns when overloading a function
105 // with different inline attributes. Earlier versions are fine.
106 // ICC and CLANG are fine.
107
108 // NOTE: starting with gcc 8.1, there is a function attribute called
109 // "noipa" which really does exactly what I want. It would also be useful
110 // for ForceToMem, for example.
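// The sketch below is an illustration only (the helper name two_sum_53 is
// not part of NTL): a non-inlined routine brackets an error-free addition
// between START_FIX and END_FIX, so that on x87 targets the arithmetic is
// carried out at true 53-bit double precision and the rounding error is
// recovered exactly.

static NO_INLINE void two_sum_53(double a, double b, double& s, double& e)
{
   START_FIX
   DOUBLE S = a + b;                         // rounded 53-bit sum
   DOUBLE bv = S - a;
   DOUBLE err = (a - (S - bv)) + (b - bv);   // exact rounding error (TwoSum)
   s = S;
   e = err;
   END_FIX
}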
91111
92112 #else
93113
94114 #define START_FIX
95115 #define END_FIX
96116
97 #endif
98
99
100 static
101 void normalize(quad_float& z, const double& xhi, const double& xlo)
117 #define NO_INLINE
118
119 #endif
120
121
122
123
124 NO_INLINE void quad_float_normalize(quad_float& z, const double& xhi, const double& xlo)
102125 {
103126 START_FIX
104127 DOUBLE u, v;
112135 END_FIX
113136 }
114137
115
116
117 #if (NTL_BITS_PER_LONG >= NTL_DOUBLE_PRECISION)
118
119
120 quad_float to_quad_float(long n)
121 {
122 DOUBLE xhi, xlo;
123
124 xhi = TrueDouble(n);
125
126 // Because we are assuming 2's complement integer
127 // arithmetic, the following prevents long(xhi) from overflowing.
128
129 if (n > 0)
130 xlo = TrueDouble(n+long(-xhi));
131 else
132 xlo = TrueDouble(n-long(xhi));
133
134 // renormalize...just to be safe
135
136 quad_float z;
137 normalize(z, xhi, xlo);
138 return z;
139 }
140
141 quad_float to_quad_float(unsigned long n)
142 {
143 DOUBLE xhi, xlo, t;
144
145 const double bnd = double(1L << (NTL_BITS_PER_LONG-2))*4.0;
146
147 xhi = TrueDouble(n);
148
149 if (xhi >= bnd)
150 t = xhi - bnd;
151 else
152 t = xhi;
153
154 // we use the "to_long" function here to be as portable as possible.
155 long llo = to_long(n - (unsigned long)(t));
156 xlo = TrueDouble(llo);
157
158 quad_float z;
159 normalize(z, xhi, xlo);
160 return z;
161 }
162 #endif
163
164
165 NTL_CHEAP_THREAD_LOCAL
166 long quad_float::oprec = 10;
167
168 void quad_float::SetOutputPrecision(long p)
169 {
170 if (p < 1) p = 1;
171
172 if (NTL_OVERFLOW(p, 1, 0))
173 ResourceError("quad_float: output precision too big");
174
175 oprec = p;
176 }
177
178
179 quad_float operator +(const quad_float& x, const quad_float& y ) {
138 NO_INLINE void quad_float_in_place_add(quad_float& x, const quad_float& y ) {
180139 START_FIX
181140 DOUBLE H, h, T, t, S, s, e, f;
182141 DOUBLE t1;
206165 e = H + h;
207166 f = H - e;
208167 f = f + h;
209 END_FIX
210 return quad_float(e, f);
211 }
212
213 quad_float& operator +=(quad_float& x, const quad_float& y ) {
214 START_FIX
215 DOUBLE H, h, T, t, S, s, e, f;
216 DOUBLE t1;
217
218 S = x.hi + y.hi;
219 T = x.lo + y.lo;
220 e = S - x.hi;
221 f = T - x.lo;
222
223 t1 = S-e;
224 t1 = x.hi-t1;
225 s = y.hi-e;
226 s = s + t1;
227
228 t1 = T-f;
229 t1 = x.lo-t1;
230 t = y.lo-f;
231 t = t + t1;
232
233
234 s = s + T;
235 H = S + s;
236 h = S - H;
237 h = h + s;
238
239 h = h + t;
240 e = H + h;
241 f = H - e;
242 f = f + h;
243168
244169 x.hi = e;
245170 x.lo = f;
246171 END_FIX
247 return x;
248 }
249
250 quad_float operator -(const quad_float& x, const quad_float& y ) {
172 }
173
174
175 NO_INLINE void quad_float_in_place_sub(quad_float& x, const quad_float& y ) {
251176 START_FIX
252177 DOUBLE H, h, T, t, S, s, e, f;
253178 DOUBLE t1, yhi, ylo;
281206 f = H - e;
282207 f = f + h;
283208
284 END_FIX
285 return quad_float(e, f);
286 }
287
288 quad_float& operator -=(quad_float& x, const quad_float& y ) {
289 START_FIX
290 DOUBLE H, h, T, t, S, s, e, f;
291 DOUBLE t1, yhi, ylo;
292
293 yhi = -y.hi;
294 ylo = -y.lo;
295
296 S = x.hi + yhi;
297 T = x.lo + ylo;
298 e = S - x.hi;
299 f = T - x.lo;
300
301 t1 = S-e;
302 t1 = x.hi-t1;
303 s = yhi-e;
304 s = s + t1;
305
306 t1 = T-f;
307 t1 = x.lo-t1;
308 t = ylo-f;
309 t = t + t1;
310
311
312 s = s + T;
313 H = S + s;
314 h = S - H;
315 h = h + s;
316
317 h = h + t;
318 e = H + h;
319 f = H - e;
320 f = f + h;
321
322209 x.hi = e;
323210 x.lo = f;
324211 END_FIX
325 return x;
326 }
327
328 quad_float operator -(const quad_float& x)
212 }
213
214 NO_INLINE void quad_float_in_place_negate(quad_float& x)
329215 {
330216 START_FIX
331217 DOUBLE xhi, xlo, u, v;
341227 v = xhi - u;
342228 v = v + xlo;
343229
344 END_FIX
345 return quad_float(u, v);
230 x.hi = u;
231 x.lo = v;
232 END_FIX
346233 }
347234
348235
349236
350237 #if (NTL_FMA_DETECTED && !defined(NTL_CONTRACTION_FIXED))
351238
352 // The configure script should fix this issue for most
353 // compilers (at least gcc, clang, and icc), but if not,
354 // this is a last ditch effort to fix it (which seems to work).
239
240 // The configure script should ensure that no FMAs are issued
241 // for most compilers (at least gcc, clang, and icc), but if not,
242 // this is a last-ditch effort to fix the problem (which seems to work).
355243
356244 double quad_float_zero = 0;
357245
367255
368256 #endif
369257
370 // NOTE: this is really sick: some compilers will issue FMA
371 // (fused mul add) instructions which will break correctness.
372 // C99 standard is supposed to prevent this across separate
373 // statements, but C++ standard doesn't guarantee much at all.
374 // In any case, gcc does not even implement the C99 standard
375 // correctly. One could disable this by compiling with
376 // an appropriate flag: -mno-fma works for gcc, while -no-fma works
377 // for icc. icc and MSVC++ also support pragmas to do this:
378 // #pragma fp_contract(off). There is also a compiler flag for
379 // gcc: -ffp-contract=off, but -mno-fma seems more widely supported.
380 // These flags work for clang, as well.
381 //
382 // But in any case, I'd rather not mess with getting these flags right.
383 // Calling Protect(a*b) has the effect of forcing the
384 // compiler to compute a*b + 0. Assuming the compiler otherwise
385 // does not perform any re-association, this should do the trick.
386 // There is a small performance penalty, but it should be reasonable.
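// A minimal sketch of that idea (the helper name ProtectSketch is
// hypothetical; NTL's actual Protect may be defined differently):

extern double quad_float_zero;   // opaque global defined above; the compiler
                                 // cannot assume its value is zero

static inline double ProtectSketch(double x) { return x + quad_float_zero; }

// Unprotected, "hx*hy - C" may be contracted into fma(hx, hy, -C), which
// skips the intermediate rounding that the error-free product relies on.
// Protected, "ProtectSketch(hx*hy) - C" subtracts from the result of an
// addition, which floating-point contraction is not allowed to fuse with
// the earlier multiply.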
387
388
389
390 quad_float operator *(const quad_float& x,const quad_float& y ) {
258
259
260
261 NO_INLINE void quad_float_in_place_mul(quad_float& x,const quad_float& y ) {
391262 START_FIX
392263 DOUBLE hx, tx, hy, ty, C, c;
393264 DOUBLE t1, t2;
422293 tx = C-hx;
423294 tx = tx+c;
424295
425 END_FIX
426 return quad_float(hx, tx);
427 }
428
429 quad_float& operator *=(quad_float& x,const quad_float& y ) {
430 START_FIX
431 DOUBLE hx, tx, hy, ty, C, c;
432 DOUBLE t1, t2;
433
434 C = Protect(NTL_QUAD_FLOAT_SPLIT*x.hi);
435 hx = C-x.hi;
436 c = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
437 hx = C-hx;
438 tx = x.hi-hx;
439 hy = c-y.hi;
440 C = Protect(x.hi*y.hi);
441 hy = c-hy;
442 ty = y.hi-hy;
443
444 // c = ((((hx*hy-C)+hx*ty)+tx*hy)+tx*ty)+(x.hi*y.lo+x.lo*y.hi);
445
446 t1 = Protect(hx*hy);
447 t1 = t1-C;
448 t2 = Protect(hx*ty);
449 t1 = t1+t2;
450 t2 = Protect(tx*hy);
451 t1 = t1+t2;
452 t2 = Protect(tx*ty);
453 c = t1+t2;
454 t1 = Protect(x.hi*y.lo);
455 t2 = Protect(x.lo*y.hi);
456 t1 = t1+t2;
457 c = c + t1;
458
459
460 hx = C+c;
461 tx = C-hx;
462 tx = tx+c;
463
464296 x.hi = hx;
465297 x.lo = tx;
466298 END_FIX
467 return x;
468 }
469
470 quad_float operator /(const quad_float& x, const quad_float& y ) {
299 }
300
301
302 NO_INLINE void quad_float_in_place_div(quad_float& x, const quad_float& y ) {
471303 START_FIX
472304 DOUBLE hc, tc, hy, ty, C, c, U, u;
473305 DOUBLE t1;
507339 ty = C-hy;
508340 ty = ty+c;
509341
510 END_FIX
511 return quad_float(hy, ty);
512 }
513
514 quad_float& operator /=(quad_float& x, const quad_float& y ) {
515 START_FIX
516 DOUBLE hc, tc, hy, ty, C, c, U, u;
517 DOUBLE t1;
518
519 C = x.hi/y.hi;
520 c = Protect(NTL_QUAD_FLOAT_SPLIT*C);
521 hc = c-C;
522 u = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
523 hc = c-hc;
524 tc = C-hc;
525 hy = u-y.hi;
526 U = Protect(C * y.hi);
527 hy = u-hy;
528 ty = y.hi-hy;
529
530 // u = (((hc*hy-U)+hc*ty)+tc*hy)+tc*ty;
531
532 u = Protect(hc*hy);
533 u = u-U;
534 t1 = Protect(hc*ty);
535 u = u+t1;
536 t1 = Protect(tc*hy);
537 u = u+t1;
538 t1 = Protect(tc*ty);
539 u = u+t1;
540
541 // c = ((((x.hi-U)-u)+x.lo)-C*y.lo)/y.hi;
542
543 c = x.hi-U;
544 c = c-u;
545 c = c+x.lo;
546 t1 = Protect(C*y.lo);
547 c = c - t1;
548 c = c/y.hi;
549
550 hy = C+c;
551 ty = C-hy;
552 ty = ty+c;
553
554342 x.hi = hy;
555343 x.lo = ty;
556344 END_FIX
557 return x;
558 }
559
560
561 quad_float sqrt(const quad_float& y) {
562 if (y.hi < 0.0)
563 ArithmeticError("quad_float: square root of negative number");
564 if (y.hi == 0.0) return quad_float(0.0,0.0);
565
566 double c;
567 c = sqrt(y.hi);
568 ForceToMem(&c); // This is fairly paranoid, but it doesn't cost too much.
569
570 START_FIX
571
345 }
346
347
348 NO_INLINE void quad_float_in_place_sqrt(quad_float& y, double& c_ref) {
349 START_FIX
350 DOUBLE c = c_ref;
572351 DOUBLE p,q,hx,tx,u,uu,cc;
573352 DOUBLE t1;
574353
596375 hx = c+cc;
597376 tx = c-hx;
598377 tx = tx+cc;
599 END_FIX
600 return quad_float(hx, tx);
601 }
602
603
604
605 void power(quad_float& z, const quad_float& a, long e)
378
379 y.hi = hx;
380 y.lo = tx;
381 END_FIX
382 }
383
384
385 NO_INLINE void quad_float_PrecisionOK(long& res, const double& one)
606386 {
607 quad_float res, u;
608 unsigned long k;
609
610 if (e < 0)
611 k = -((unsigned long) e);
612 else
613 k = e;
614
615 res = 1.0;
616 u = a;
617
618 while (k) {
619 if (k & 1)
620 res = res * u;
621
622 k = k >> 1;
623 if (k)
624 u = u * u;
625 }
626
627 if (e < 0)
628 z = 1.0/res;
629 else
630 z = res;
631 }
632
633
634 void power2(quad_float& z, long e)
635 {
636 z.hi = _ntl_ldexp(1.0, e);
637 z.lo = 0;
638 }
639
640
641 long to_long(const quad_float& x)
642 {
643 double fhi, flo;
644
645 fhi = floor(x.hi);
646
647 if (fhi == x.hi)
648 flo = floor(x.lo);
649 else
650 flo = 0;
651
652 // the following code helps to prevent unnecessary integer overflow,
653 // and guarantees that to_long(to_quad_float(a)) == a, for all long a,
654 // provided long's are not too wide.
655
656 if (fhi > 0)
657 return long(flo) - long(-fhi);
658 else
659 return long(fhi) + long(flo);
660 }
661
662
663
664 // This version of ZZ to quad_float conversion relies on the
665 // precise rounding rules implemented by the ZZ to double conversion.
666
667
668 void conv(quad_float& z, const ZZ& a)
669 {
670 double xhi, xlo;
671
672 conv(xhi, a);
673
674 if (!IsFinite(&xhi)) {
675 z.hi = xhi;
676 z.lo = 0;
677 return;
678 }
679
680 NTL_ZZRegister(t);
681
682 conv(t, xhi);
683 sub(t, a, t);
684
685 conv(xlo, t);
686
687 normalize(z, xhi, xlo);
688
689 // The following is just paranoia.
690 if (fabs(z.hi) < NTL_FDOUBLE_PRECISION && z.lo != 0)
691 LogicError("internal error: ZZ to quad_float conversion");
692 }
693
694 void conv(ZZ& z, const quad_float& x)
695 {
696 NTL_ZZRegister(t1);
697 NTL_ZZRegister(t2);
698 NTL_ZZRegister(t3);
699
700 double fhi, flo;
701
702 fhi = floor(x.hi);
703
704 if (fhi == x.hi) {
705 flo = floor(x.lo);
706
707 conv(t1, fhi);
708 conv(t2, flo);
709
710 add(z, t1, t2);
711 }
712 else
713 conv(z, fhi);
714 }
715
716
717
718 ostream& operator<<(ostream& s, const quad_float& a)
719 {
720 quad_float aa = a;
721
722 if (!IsFinite(&aa)) {
723 s << "NaN";
724 return s;
725 }
726
727 RRPush push;
728 RROutputPush opush;
729
730 RR::SetPrecision(long(3.33*quad_float::oprec) + 10);
731 RR::SetOutputPrecision(quad_float::oprec);
732
733 NTL_TLS_LOCAL(RR, t);
734
735 conv(t, a);
736 s << t;
737
738 return s;
739 }
740
741 istream& operator>>(istream& s, quad_float& x)
742 {
743 RRPush push;
744 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
745
746 NTL_TLS_LOCAL(RR, t);
747 NTL_INPUT_CHECK_RET(s, s >> t);
748 conv(x, t);
749
750 return s;
751 }
752
753 void random(quad_float& x)
754 {
755 RRPush push;
756 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
757
758 NTL_TLS_LOCAL(RR, t);
759 random(t);
760 conv(x, t);
761 }
762
763 quad_float random_quad_float()
764 {
765 quad_float x;
766 random(x);
767 return x;
768 }
769
770 long IsFinite(quad_float *x)
771 {
772 return IsFinite(&x->hi) && IsFinite(&x->lo);
773 }
774
775
776 long PrecisionOK()
777 {
778387 START_FIX
779388 long k;
780 DOUBLE l1 = (double)1;
781 DOUBLE lh = 1/(double)2;
389 DOUBLE l1 = one;
390 DOUBLE lh = one/double(2);
782391 DOUBLE epsilon;
783392 DOUBLE fudge, oldfudge;
784393
794403 fudge = l1 + epsilon;
795404 } while (fudge > l1 && fudge < oldfudge);
796405
797 END_FIX
798 return k == NTL_DOUBLE_PRECISION;
799 }
800
801 quad_float floor(const quad_float& x)
802 {
803 double fhi = floor(x.hi);
804
805 if (fhi != x.hi)
806 return quad_float(fhi, 0.0);
807 else {
808 double flo = floor(x.lo);
809 quad_float z;
810 normalize(z, fhi, flo);
811 return z;
812 }
813 }
814
815
816 quad_float ceil(const quad_float& x) {
817 return -floor(-x);
818 }
819
820 quad_float trunc(const quad_float& x) {
821 if (x>=0.0) return floor(x); else return -floor(-x);
822 }
823
824
825
826 long compare(const quad_float& x, const quad_float& y)
827 {
828 if (x.hi > y.hi)
829 return 1;
830 else if (x.hi < y.hi)
831 return -1;
832 else if (x.lo > y.lo)
833 return 1;
834 else if (x.lo < y.lo)
835 return -1;
836 else
837 return 0;
838 }
839
840
841 quad_float fabs(const quad_float& x)
842 { if (x.hi>=0.0) return x; else return -x; }
843
844
845 quad_float ldexp(const quad_float& x, long exp) { // x*2^exp
846 double xhi, xlo;
847 quad_float z;
848
849 xhi = _ntl_ldexp(x.hi, exp);
850 xlo = _ntl_ldexp(x.lo, exp);
851
852 normalize(z, xhi, xlo);
853 return z;
854 }
855
856
857 quad_float exp(const quad_float& x) { // New version 97 Aug 05
858 /*
859 ! Calculate a quadruple-precision exponential
860 ! Method:
861 ! x x.log2(e) nint[x.log2(e)] + frac[x.log2(e)]
862 ! e = 2 = 2
863 !
864 ! iy fy
865 ! = 2 . 2
866 ! Then
867 ! fy y.loge(2)
868 ! 2 = e
869 !
870 ! Now y.loge(2) will be less than 0.3466 in absolute value.
871 ! This is halved and a Pade approximation is used to approximate e^x over
872 ! the region (-0.1733, +0.1733). This approximation is then squared.
873 */
874 if (x.hi<DBL_MIN_10_EXP*2.302585092994045684017991)
875 return to_quad_float(0.0);
876 if (x.hi>DBL_MAX_10_EXP*2.302585092994045684017991) {
877 ResourceError("exp(quad_float): overflow");
878 }
879
880 static const quad_float Log2 = to_quad_float("0.6931471805599453094172321214581765680755");
881 // GLOBAL (assumes C++11 thread-safe init)
882
883 quad_float y,temp,ysq,sum1,sum2;
884 long iy;
885 y=x/Log2;
886 temp = floor(y+0.5);
887 iy = to_long(temp);
888 y=(y-temp)*Log2;
889 y=ldexp(y,-1L);
890 ysq=y*y;
891 sum1=y*((((ysq+3960.0)*ysq+2162160.0)*ysq+302702400.0)*ysq+8821612800.0);
892 sum2=(((90.0*ysq+110880.0)*ysq+30270240.0)*ysq+2075673600.0)*ysq+17643225600.0;
893 /*
894 ! sum2 + sum1 2.sum1
895 ! Now approximation = ----------- = 1 + ----------- = 1 + 2.temp
896 ! sum2 - sum1 sum2 - sum1
897 !
898 ! Then (1 + 2.temp)^2 = 4.temp.(1 + temp) + 1
899 */
900 temp=sum1/(sum2-sum1);
901 y=temp*(temp+1);
902 y=ldexp(y,2L);
903 return ldexp(y+1,iy);
904 }
905
906 quad_float log(const quad_float& t) { // Newton method. See Bailey, MPFUN
907 if (t.hi <= 0.0) {
908 ArithmeticError("log(quad_float): argument must be positive");
909 }
910 double s1 = log(t.hi);
911 ForceToMem(&s1); // Again, this is fairly paranoid.
912 quad_float s;
913 s = s1;
914 quad_float e;
915 e=exp(s);
916 return s+(t-e)/e; // Newton step
917 }
918
919 long operator> (const quad_float& x, const quad_float& y) {
920 return (x.hi> y.hi) || (x.hi==y.hi && x.lo> y.lo); }
921 long operator>=(const quad_float& x, const quad_float& y) {
922 return (x.hi>y.hi) || (x.hi==y.hi && x.lo>=y.lo); }
923 long operator< (const quad_float& x, const quad_float& y) {
924 return (x.hi< y.hi) || (x.hi==y.hi && x.lo< y.lo); }
925 long operator<=(const quad_float& x, const quad_float& y) {
926 return (x.hi<y.hi) || (x.hi==y.hi && x.lo<=y.lo); }
927 long operator==(const quad_float& x, const quad_float& y)
928 { return x.hi==y.hi && x.lo==y.lo; }
929 long operator!=(const quad_float& x, const quad_float& y)
930 { return x.hi!=y.hi || x.lo!=y.lo; }
406 res = (k == NTL_DOUBLE_PRECISION);
407 END_FIX
408 }
409
410
931411
932412
933413 NTL_END_IMPL
0 #include <NTL/quad_float.h>
1 #include <NTL/RR.h>
2
3 #include <cfloat>
4
5 NTL_START_IMPL
6
7
8 #if (NTL_BITS_PER_LONG >= NTL_DOUBLE_PRECISION)
9
10
11 quad_float to_quad_float(long n)
12 {
13 double xhi, xlo;
14
15 xhi = TrueDouble(n);
16
17 // Because we are assuming 2's complement integer
18 // arithmetic, the following prevents long(xhi) from overflowing.
19
20 if (n > 0)
21 xlo = TrueDouble(n+long(-xhi));
22 else
23 xlo = TrueDouble(n-long(xhi));
24
25 // renormalize...just to be safe
26
27 quad_float z;
28 quad_float_normalize(z, xhi, xlo);
29 return z;
30 }
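// A concrete check of the overflow guard (assuming 64-bit two's complement
// longs): for n = 2^63 - 1, TrueDouble(n) rounds up to xhi = 2^63, which is
// just outside the range of long, so long(xhi) would overflow; the code
// instead forms n + long(-xhi) = -1, giving the exact pair (xhi, xlo) =
// (2^63, -1).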
31
32 quad_float to_quad_float(unsigned long n)
33 {
34 double xhi, xlo, t;
35
36 const double bnd = double(1L << (NTL_BITS_PER_LONG-2))*4.0;
37
38 xhi = TrueDouble(n);
39
40 if (xhi >= bnd)
41 t = xhi - bnd;
42 else
43 t = xhi;
44
45 // we use the "to_long" function here to be as portable as possible.
46 long llo = to_long(n - (unsigned long)(t));
47 xlo = TrueDouble(llo);
48
49 quad_float z;
50 quad_float_normalize(z, xhi, xlo);
51 return z;
52 }
53 #endif
54
55
56 NTL_CHEAP_THREAD_LOCAL
57 long quad_float::oprec = 10;
58
59 void quad_float::SetOutputPrecision(long p)
60 {
61 if (p < 1) p = 1;
62
63 if (NTL_OVERFLOW(p, 1, 0))
64 ResourceError("quad_float: output precision too big");
65
66 oprec = p;
67 }
68
69
70
71 void power(quad_float& z, const quad_float& a, long e)
72 {
73 quad_float res, u;
74 unsigned long k;
75
76 if (e < 0)
77 k = -((unsigned long) e);
78 else
79 k = e;
80
81 res = 1.0;
82 u = a;
83
84 while (k) {
85 if (k & 1)
86 res = res * u;
87
88 k = k >> 1;
89 if (k)
90 u = u * u;
91 }
92
93 if (e < 0)
94 z = 1.0/res;
95 else
96 z = res;
97 }
98
99
100 void power2(quad_float& z, long e)
101 {
102 z.hi = _ntl_ldexp(1.0, e);
103 z.lo = 0;
104 }
105
106
107 long to_long(const quad_float& x)
108 {
109 double fhi, flo;
110
111 fhi = floor(x.hi);
112
113 if (fhi == x.hi)
114 flo = floor(x.lo);
115 else
116 flo = 0;
117
118 // the following code helps to prevent unnecessary integer overflow,
119 // and guarantees that to_long(to_quad_float(a)) == a, for all long a,
120 // provided long's are not too wide.
121
122 if (fhi > 0)
123 return long(flo) - long(-fhi);
124 else
125 return long(fhi) + long(flo);
126 }
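// Worked example (64-bit longs), continuing the to_quad_float case above:
// for x = (2^63, -1), floor gives fhi = 2^63 and flo = -1; since fhi > 0 the
// result is long(flo) - long(-fhi) = -1 - (-2^63) = 2^63 - 1, recovering
// LONG_MAX exactly without ever converting fhi itself to a long.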
127
128
129
130 // This version of ZZ to quad_float conversion relies on the
131 // precise rounding rules implemented by the ZZ to double conversion.
132
133
134 void conv(quad_float& z, const ZZ& a)
135 {
136 double xhi, xlo;
137
138 conv(xhi, a);
139
140 if (!IsFinite(&xhi)) {
141 z.hi = xhi;
142 z.lo = 0;
143 return;
144 }
145
146 NTL_ZZRegister(t);
147
148 conv(t, xhi);
149 sub(t, a, t);
150
151 conv(xlo, t);
152
153 quad_float_normalize(z, xhi, xlo);
154 }
155
156 void conv(ZZ& z, const quad_float& x)
157 {
158 NTL_ZZRegister(t1);
159 NTL_ZZRegister(t2);
160 NTL_ZZRegister(t3);
161
162 double fhi, flo;
163
164 fhi = floor(x.hi);
165
166 if (fhi == x.hi) {
167 flo = floor(x.lo);
168
169 conv(t1, fhi);
170 conv(t2, flo);
171
172 add(z, t1, t2);
173 }
174 else
175 conv(z, fhi);
176 }
177
178
179
180 ostream& operator<<(ostream& s, const quad_float& a)
181 {
182 quad_float aa = a;
183
184 if (!IsFinite(&aa)) {
185 s << "NaN";
186 return s;
187 }
188
189 RRPush push;
190 RROutputPush opush;
191
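   // 3.33 is slightly more than log2(10) ~= 3.32, so this allots enough RR
   // precision (in bits) to print oprec decimal digits, with some slack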
192 RR::SetPrecision(long(3.33*quad_float::oprec) + 10);
193 RR::SetOutputPrecision(quad_float::oprec);
194
195 NTL_TLS_LOCAL(RR, t);
196
197 conv(t, a);
198 s << t;
199
200 return s;
201 }
202
203 istream& operator>>(istream& s, quad_float& x)
204 {
205 RRPush push;
206 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
207
208 NTL_TLS_LOCAL(RR, t);
209 NTL_INPUT_CHECK_RET(s, s >> t);
210 conv(x, t);
211
212 return s;
213 }
214
215 void random(quad_float& x)
216 {
217 RRPush push;
218 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
219
220 NTL_TLS_LOCAL(RR, t);
221 random(t);
222 conv(x, t);
223 }
224
225 quad_float random_quad_float()
226 {
227 quad_float x;
228 random(x);
229 return x;
230 }
231
232 long IsFinite(quad_float *x)
233 {
234 return IsFinite(&x->hi) && IsFinite(&x->lo);
235 }
236
237
238 quad_float floor(const quad_float& x)
239 {
240 double fhi = floor(x.hi);
241
242 if (fhi != x.hi)
243 return quad_float(fhi, 0.0);
244 else {
245 double flo = floor(x.lo);
246 quad_float z;
247 quad_float_normalize(z, fhi, flo);
248 return z;
249 }
250 }
251
252
253 quad_float ceil(const quad_float& x) {
254 return -floor(-x);
255 }
256
257 quad_float trunc(const quad_float& x) {
258 if (x>=0.0) return floor(x); else return -floor(-x);
259 }
260
261
262
263 long compare(const quad_float& x, const quad_float& y)
264 {
265 if (x.hi > y.hi)
266 return 1;
267 else if (x.hi < y.hi)
268 return -1;
269 else if (x.lo > y.lo)
270 return 1;
271 else if (x.lo < y.lo)
272 return -1;
273 else
274 return 0;
275 }
276
277
278 quad_float fabs(const quad_float& x)
279 { if (x.hi>=0.0) return x; else return -x; }
280
281
282 quad_float ldexp(const quad_float& x, long exp) { // x*2^exp
283 double xhi, xlo;
284 quad_float z;
285
286 xhi = _ntl_ldexp(x.hi, exp);
287 xlo = _ntl_ldexp(x.lo, exp);
288
289 quad_float_normalize(z, xhi, xlo);
290 return z;
291 }
292
293
294 quad_float exp(const quad_float& x) { // New version 97 Aug 05
295 /*
296 ! Calculate a quadruple-precision exponential
297 ! Method:
298 ! x x.log2(e) nint[x.log2(e)] + frac[x.log2(e)]
299 ! e = 2 = 2
300 !
301 ! iy fy
302 ! = 2 . 2
303 ! Then
304 ! fy y.loge(2)
305 ! 2 = e
306 !
307 ! Now y.loge(2) will be less than 0.3466 in absolute value.
308 ! This is halved and a Pade approximation is used to approximate e^x over
309 ! the region (-0.1733, +0.1733). This approximation is then squared.
310 */
311 if (x.hi<DBL_MIN_10_EXP*2.302585092994045684017991)
312 return to_quad_float(0.0);
313 if (x.hi>DBL_MAX_10_EXP*2.302585092994045684017991) {
314 ResourceError("exp(quad_float): overflow");
315 }
316
317 static const quad_float Log2 = to_quad_float("0.6931471805599453094172321214581765680755");
318 // GLOBAL (assumes C++11 thread-safe init)
319
320 quad_float y,temp,ysq,sum1,sum2;
321 long iy;
322 y=x/Log2;
323 temp = floor(y+0.5);
324 iy = to_long(temp);
325 y=(y-temp)*Log2;
326 y=ldexp(y,-1L);
327 ysq=y*y;
328 sum1=y*((((ysq+3960.0)*ysq+2162160.0)*ysq+302702400.0)*ysq+8821612800.0);
329 sum2=(((90.0*ysq+110880.0)*ysq+30270240.0)*ysq+2075673600.0)*ysq+17643225600.0;
330 /*
331 ! sum2 + sum1 2.sum1
332 ! Now approximation = ----------- = 1 + ----------- = 1 + 2.temp
333 ! sum2 - sum1 sum2 - sum1
334 !
335 ! Then (1 + 2.temp)^2 = 4.temp.(1 + temp) + 1
336 */
337 temp=sum1/(sum2-sum1);
338 y=temp*(temp+1);
339 y=ldexp(y,2L);
340 return ldexp(y+1,iy);
341 }
342
343 quad_float log(const quad_float& t) { // Newton method. See Bailey, MPFUN
344 if (t.hi <= 0.0) {
345 ArithmeticError("log(quad_float): argument must be positive");
346 }
347
348 quad_float s = to_quad_float(log(t.hi));
349 // NOTE: in case log yields excess precision, this assumes
350 // that to_quad_float removes it
351
352 quad_float e = exp(s);
353 return s+(t-e)/e; // Newton step
354 }
355
356 quad_float sqrt(const quad_float& y)
357 {
358 if (y.hi < 0.0)
359 ArithmeticError("quad_float: square root of negative number");
360 if (y.hi == 0.0) return quad_float(0.0,0.0);
361
362 double c = TrueDouble(sqrt(y.hi));
363 // NOTE: we call TrueDouble, just in case sqrt yields excess precision
364
365 quad_float yy = y;
366 quad_float_in_place_sqrt(yy, c);
367 return yy;
368 }
369
370
371 long operator> (const quad_float& x, const quad_float& y) {
372 return (x.hi> y.hi) || (x.hi==y.hi && x.lo> y.lo); }
373 long operator>=(const quad_float& x, const quad_float& y) {
374 return (x.hi>y.hi) || (x.hi==y.hi && x.lo>=y.lo); }
375 long operator< (const quad_float& x, const quad_float& y) {
376 return (x.hi< y.hi) || (x.hi==y.hi && x.lo< y.lo); }
377 long operator<=(const quad_float& x, const quad_float& y) {
378 return (x.hi<y.hi) || (x.hi==y.hi && x.lo<=y.lo); }
379 long operator==(const quad_float& x, const quad_float& y)
380 { return x.hi==y.hi && x.lo==y.lo; }
381 long operator!=(const quad_float& x, const quad_float& y)
382 { return x.hi!=y.hi || x.lo!=y.lo; }
383
384
385 NTL_END_IMPL