ntl / 12851b6
New upstream version 11.3.0 Julien Puydt 5 years ago
134 changed file(s) with 11281 addition(s) and 3983 deletion(s).
0 NTL -- a library for doing number theory -- version 11.0.0
1 Release date: 2018.04.07
0 NTL -- a library for doing number theory -- version 11.3.0
1 Release date: 2018.08.17
22
33 Author: Victor Shoup (victor@shoup.net)
44
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/BasicThreadPool.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/BasicThreadPool.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2E.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2E.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2EX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2EX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2EXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2EXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2X.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2X.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2XFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2XFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/GF2XVec.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/GF2XVec.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/HNF.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/HNF.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/LLL.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/LLL.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/Lazy.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/Lazy.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/LazyTable.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/LazyTable.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/RR.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/RR.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/SmartPtr.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/SmartPtr.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
Binary diff not shown
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
293293 <span class="Comment">// The implementation may or may not Euclid's algorithm,</span>
294294 <span class="Comment">// but the coefficients a and t are always computed as if </span>
295295 <span class="Comment">// it did.</span>
296
297 <span class="Comment">// In particular, the following inequalties should hold:</span>
298 <span class="Comment">// |s| &lt;= 1 OR |s| &lt; |b|/(2*d)</span>
299 <span class="Comment">// |t| &lt;= 1 OR |t| &lt; |a|/(2*d)</span>
300
296301
297302
298303 <span class="Comment">// special-purpose single-precision variants:</span>
261261 // The implementation may or may not use Euclid's algorithm,
262262 // but the coefficients s and t are always computed as if
263263 // it did.
264
265 // In particular, the following inequalities should hold:
266 // |s| <= 1 OR |s| < |b|/(2*d)
267 // |t| <= 1 OR |t| < |a|/(2*d)
268
264269
265270
266271 // special-purpose single-precision variants:
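To make the stated bounds concrete, here is a minimal sketch (not part of the NTL documentation) that calls XGCD on random 1000-bit integers and checks both the identity d = s*a + t*b and the coefficient bounds; the strict inequalities are assumed to apply because d > 0 and |a|, |b| > 1 for such inputs.

    #include <NTL/ZZ.h>
    #include <cassert>
    using namespace NTL;

    int main()
    {
       ZZ a, b, d, s, t;
       RandomBits(a, 1000);
       RandomBits(b, 1000);

       XGCD(d, s, t, a, b);            // d = gcd(a, b) = s*a + t*b

       assert(d == s*a + t*b);
       // the bounds above, rearranged to avoid division:
       assert(abs(s) <= 1 || 2*d*abs(s) < abs(b));
       assert(abs(t) <= 1 || 2*d*abs(t) < abs(a));
       return 0;
    }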
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZVec.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZVec.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_limbs.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_limbs.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_pEX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_pEX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_pEXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_pEXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_pX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_pX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/ZZ_pXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/ZZ_pXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
3838 NTL_STD_CXX11=on # Build assuming C++11 features
3939
4040 NTL_SAFE_VECTORS=on # build in "safe vector" mode
41
42 NTL_ENABLE_AVX_FFT=off # implement the small-prime FFT using AVX
43 # instructions...this is experimental at the
44 # moment, and may lead to worse performance
45
46 NTL_AVOID_AVX512=off # avoid using 512-bit AVX registers
4147
4248
4349 ########## Here are more detailed description of these variables.
224230 # details.
225231
226232
233 ############ AVX FFT
234
235 NTL_ENABLE_AVX_FFT=off # implement the small-prime FFT using AVX
236 # instructions...this is experimental at the
237 # moment, and may lead to worse performance
238
239 On machines with AVX2/FMA or AVX512, this will implement the small-prime FFT
240 using AVX code. This is still quite experimental, and may lead to worse
241 performance. While the FFT itself can run 2-3 times faster, this comes at the
242 cost of (1) a restriction to 50-bit primes (so NTL_SP_NBITS will be set to 50
243 instead of 60), and (2) possible CPU-speed throttling, which slows down other
244 operations. So far, it seems that the only operations that are faster are
245 arithmetic operations in zz_pX, and only for certain choices of modulus.
246 Arithmetic operations in ZZ_pX, with a large modulus, can run slower with AVX
247 than without.
248
249
250 ########### Avoid 512-bit AVX registers
251
252 NTL_AVOID_AVX512=off # avoid using 512-bit AVX registers
253
254 Even if 512-bit AVX registers are available, this setting avoids using them. This
255 affects both Mat<zz_p> operations and the AVX-based FFT (see above).
256
227257
228258
229259
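As a rough illustration (not an NTL-provided program), the flags above end up as macros in the installed configuration header, so a client can check at compile time which of these options the library was built with; this sketch assumes the header is installed under its usual name NTL/config.h.

    #include <NTL/config.h>
    #include <iostream>

    int main()
    {
    #ifdef NTL_ENABLE_AVX_FFT
       std::cout << "built with NTL_ENABLE_AVX_FFT\n";
    #endif
    #ifdef NTL_AVOID_AVX512
       std::cout << "built with NTL_AVOID_AVX512\n";
    #endif
       return 0;
    }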
628658 NTL_FFT_BIGTAB=off
629659
630660 # Precomputed tables are used to store all the roots of unity
631 # used in FFT computations.
661 # used in FFT computations.
632662
633663
634664 NTL_FFT_LAZYMUL=off
Binary diff not shown
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_pEX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_pEX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_pEXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_pEXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_pX.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_pX.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/lzz_pXFactoring.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/lzz_pXFactoring.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_GF2.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_GF2.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_GF2E.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_GF2E.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_RR.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_RR.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_ZZ.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_ZZ.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_ZZ_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_ZZ_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_ZZ_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_ZZ_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_lzz_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_lzz_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_lzz_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_lzz_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_poly_ZZ.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_poly_ZZ.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_poly_ZZ_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_poly_ZZ_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/mat_poly_lzz_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/mat_poly_lzz_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/matrix.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/matrix.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/pair.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/pair.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/quad_float.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/quad_float.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/tools.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/tools.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
7171 for their collaboration and support over the years.
7272
7373
74 <li>
75 Thanks to
76 <a href="http://web.maths.unsw.edu.au/~davidharvey/">David Harvey</a>
77 for numerous improvements to NTL's FFT code. The current version
78 of NTL's FFT is derived from code originally written by David.
79
80 <li>
81 Thanks to
82 <a href="http://personales.unican.es/taberalf/">Luis Felipe Tabera Alonso</a>
83 for porting the fast
84 GCD and XGCD code to <tt>GF2EX</tt>, <tt>zz_pEX</tt>, and <tt>ZZ_pEX</tt>,
85 and for testing and tuning the code.
86
87
7488 </ul>
7589
7690
1414 A Tour of NTL: Summary of Changes
1515 </p>
1616 </h1>
17
18
19 <p><hr><p>
20 <h3>
21 2018.08.17: Changes between NTL 11.2.1 and 11.3.0
22 </h3>
23
24 <ul>
25 <li>
26 <b>Implemented an AVX-based small-prime FFT</b> (which works with
27 both AVX2 and AVX512)
28 <ul>
29 <li>
30 This can give a 2-3x speedup for the FFT.
31 <li>
32 However, it is not enabled by default, because it reduces
33 the small-prime size bound from 60 bits to 50 bits,
34 and may slow down certain computations.
35 <li>
36 This slowdown has two causes: some CRT-based computations
37 get slower because of the smaller prime size, and
38 Intel CPUs may throttle their clock speed when executing AVX
39 instructions.
40 <li>
41 To enable this feature, configure with <tt>NTL_ENABLE_AVX_FFT=on</tt>.
42 <li>
43 Here are some running times on a Skylake Xeon machine
44 (<tt>Intel(R) Xeon(R) Gold 6132 CPU @ 2.60GHz</tt>).
45 For various values of <i>n</i>,
46 we measure the time to compute <i>a<sup>e</sup>&nbsp;</i>mod<i>&nbsp;f</i>,
47 where <i>f</i> is a random monic polynomial of degree <i>n</i>
48 over <i>Z<sub>17</sub></i>,
49 <i>a</i> is a random polynomial of degree less than <i>n</i>,
50 and <i>e=2<sup>n</sup>-1</i> (a minimal reproduction sketch follows this change list).
51 <pre>
52 n 1024 2048 4096 8192 16384
53 non-AVX 0.171 0.741 3.192 14.348 60.812
54 AVX512 0.089 0.372 1.648 7.740 35.588
55 </pre>
56 </ul>
57
58 <p>
59 <li>
60 <b>Implemented AVX512 instruction sequences</b>
61 <ul>
62 <li>
63 This affects <tt>Mat&lt;zz_p&gt;</tt> arithmetic and the small-prime FFT.
64 <li>
65 Because AVX512 instructions can in certain situations
66 lead to slower computations (because of CPU "throttling"),
67 this feature can be disabled by configuring with
68 <tt>NTL_AVOID_AVX512=on</tt>.
69 </ul>
70
71 <p>
72 <li>
73 <b>Performance tuned <tt>GF2EX</tt> arithmetic</b>
74 <ul>
75 <li>
76 Tuned crossovers for various algorithms.
77 </ul>
78
79 <p>
80 <li>
81 <b>Implemented asymptotically fast GCD and XGCD for
82 <tt>GF2EX</tt>, <tt>zz_pEX</tt>, and <tt>ZZ_pEX</tt></b>
83 <ul>
84 <li>
85 Some work may still need to be done to fine tune
86 the crossovers, but they should be pretty good as is.
87 <li>
88 Many thanks to
89 <a href="http://personales.unican.es/taberalf/">Luis Felipe Tabera Alonso</a> for porting the code,
90 as well as testing and tuning it.
91 </ul>
92
93 <p>
94 <li>
95 <b>Other small changes</b>
96 <ul>
97 <li>
98 Restructured the <tt>quad_float</tt> implementation to better isolate
99 the parts that depend on correct FP rounding.
100 <li>
101 Standardized vector growth rate to 1.5 via the function <tt>_ntl_vec_grow</tt>.
102 <li>
103 Got rid of most uses of <tt>__restrict</tt> in <tt>mat_lzz_p.cpp</tt>,
104 some of which were technically UB.
105 <li>
106 Got rid of some uses of <tt>#warning</tt>, which are not portable.
107 </ul>
108
109 </ul>
110
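Minimal reproduction sketch (not the actual benchmark harness) of the operation measured in the table above: computing a^e mod f over Z_17 with e = 2^n - 1, using documented zz_pX interfaces.

    #include <NTL/lzz_pX.h>
    #include <iostream>
    using namespace NTL;

    int main()
    {
       long n = 1024;
       zz_p::init(17);               // work over Z_17

       zz_pX f;
       random(f, n);                 // random coefficients below degree n...
       SetCoeff(f, n);               // ...made monic of degree n

       zz_pX a = random_zz_pX(n);    // random a of degree < n
       ZZ e = power2_ZZ(n) - 1;      // e = 2^n - 1

       zz_pXModulus F(f);
       zz_pX x;
       double t0 = GetTime();
       PowerMod(x, a, e, F);         // the operation being timed
       std::cout << (GetTime() - t0) << "\n";
       return 0;
    }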
111 <p><hr><p>
112 <h3>
113 2018.07.15: Changes between NTL 11.2.0 and 11.2.1
114 </h3>
115
116 <ul>
117 <li>
118 Fixed an embarrassing bug, introduced in NTL 11.2.0,
119 in which <tt>add(ZZ,ZZ,long)</tt>
120 and <tt>sub(ZZ,ZZ,long)</tt> would give an incorrect result
121 if the third argument was zero.
122 <li>
123 Fixed incorrect libtool version number in NTL 11.2.0.
124 </ul>
125
126 <p><hr><p>
127 <h3>
128 2018.07.07: Changes between NTL 11.1.0 and 11.2.0
129 </h3>
130
131 <ul>
132 <li>
133 <b>Complete re-write of the
134 Schoenhage-Strassen FFT for <tt>ZZX</tt> arithmetic.</b>
135
136 <ul>
137 <li>
138 Implementation of "truncated" FFT
139 <li>
140 Implementation of the "sqrt 2" trick
141 <li>
142 More efficient implementation of low-level butterfly operations
143 <li>
144 Here is some timing data comparing <tt>ZZX</tt> multiplication times
145 in NTL 11.0 and 11.2.
146 The entries are the ratio of the 11.0 time to the 11.2 time (so
147 the bigger the number, the bigger the improvement).
148 The rows are labeled by the bit-length <i>k</i> of the coefficients,
149 the columns by the degree bound <i>n</i>.
150 Unlabeled columns represent degree bounds halfway between the labeled
151 ones. (A minimal <tt>ZZX</tt> multiplication sketch follows this change list.)
152
153
154 <p>
155
156 <img src="zmulrat.jpg" border="0" style="display: block; width: 60%; height: auto;">
157
158 <p>
159 <li>
160 For multiplication in <tt>ZZX</tt>, NTL and
161 <a href="http://www.flintlib.org">FLINT</a>
162 now have comparable performance across a wide range
163 of parameter sizes, with NTL being 2x faster for some parameters,
164 and FLINT being 1.5x faster for others.
165 Here is a chart showing the ratio of FLINT time
166 over NTL time (so the bigger the number, the faster NTL is relative to FLINT).
167 <p>
168
169 <img src="flintrat.jpg" border="0" style="display: block; width: 60%; height: auto;">
170
171 <p>
172 <li>
173 See also this report on
174 <a href="http://www.shoup.net/ntl/benchmarks.pdf">NTL vs FLINT</a>
175 for detailed benchmarks that compare the performance of NTL and FLINT
176 on a number of operations and parameter settings.
177
178 <li>
179 Future plans for NTL's Schoenhage-Strassen code:
180 <ul>
181 <li>
182 Implement something like Bailey's 4-step variant
183 (which should yield better cache behavior)
184 <li>
185 Thread boosting (built on top of the 4-step variant)
186 </ul>
187 </ul>
188
189 <li>
190 Some fine tuning of the new small-prime
191 truncated-FFT implementation introduced in version 11.0.
192 <li>
193 Fixed an obscure bug in the new small-prime FFT code: this only affects
194 users who call low-level, undocumented FFT routines
195 on transforms of size 2, so it is unlikely to have affected
196 any real code.
197
198 <li>
199 Performance improvements to <tt>ZZ+long</tt> and <tt>ZZ-long</tt>
200 routines (and by extension <tt>ZZ+=long</tt>, <tt>ZZ-=long</tt>,
201 <tt>ZZ++</tt>, and <tt>ZZ--</tt>)
202
203
204
205 </ul>
206
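Here is the minimal ZZX multiplication sketch referred to above (not a benchmarking harness): multiplying two random polynomials of degree less than n with k-bit coefficients, which for parameters of this size exercises the rewritten Schoenhage-Strassen code.

    #include <NTL/ZZX.h>
    #include <iostream>
    using namespace NTL;

    int main()
    {
       long n = 4096;                // degree bound
       long k = 1024;                // coefficient bit-length

       ZZX a, b, c;
       for (long i = 0; i < n; i++) {
          SetCoeff(a, i, RandomBits_ZZ(k));
          SetCoeff(b, i, RandomBits_ZZ(k));
       }

       double t0 = GetTime();
       mul(c, a, b);                 // ZZX multiplication
       std::cout << "mul time: " << (GetTime() - t0) << "\n";
       return 0;
    }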
207 <p><hr><p>
208 <h3>
209 2018.06.07: Changes between NTL 11.0.0 and 11.1.0
210 </h3>
211
212 <ul>
213 <li>
214 <b>Complete re-write of the low-level "small-prime" FFT (a.k.a., NTT).</b>
215
216 <ul>
217 <li> This implements a "truncated" FFT, which can speed up
218 polynomial multiplication by a factor of two, and which
219 mainly eliminates "jumps" in the running time at powers of two.
220 The new FFT routines are in fact a bit faster even at powers of two.
221
222 <li> Some low-level interfaces have changed, but these are
223 all <i>undocumented</i>, so should not cause
224 any problems for clients that don't inappropriately
225 use such interfaces.
226
227
228 <li>
229 Here is some timing data comparing the new (truncated) FFT to
230 the old (plain) FFT. <i>x</i>-axis is degree bound,
231 <i>y</i>-axis is time (in seconds), shown on a log/log scale.
232 This is the time to multiply two polynomials modulo
233 a single-precision "FFT" prime (60 bits).
234
235 <p>
236
237 <img src="TFT-time.jpg" border="0" style="display: block; width: 40%; height: auto;">
238
239 </ul>
240
241 <p>
242 <li>
243 <b>Improved performance of ZZ mul and sqr on small inputs</b>
244 <ul>
245 <li>mul speedup: 1 limb: 2.5x; 2 limbs: 1.4x; 3 limbs: 1.3x.
246 <li>NTL now makes explicit calls to <tt>mpn_sqr</tt> and
247 requires GMP version 5.0 or later.
248 </ul>
249
250 <p>
251 <li>
252 <b>Other changes:</b>
253 <ul>
254 <li>
255 Changed header files to make Windows installation more reliable,
256 especially for IDEs like Code Blocks
257
258 <li>
259 Added documentation for the <tt>GCD</tt> routine in the <tt>ZZ</tt> module (a short usage sketch follows this change list)
260
261 <li>
262 Fixed a bit of UB in the <tt>lip.h</tt> interface (<tt>_ntl_gbigint_body</tt>)
263 </ul>
264
265 </ul>
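Short usage sketch (not part of the NTL documentation) of the ZZ routines mentioned above: multiplying two random 1000-bit integers and computing their GCD with the now-documented GCD routine.

    #include <NTL/ZZ.h>
    #include <iostream>
    using namespace NTL;

    int main()
    {
       ZZ a, b;
       RandomBits(a, 1000);          // random 1000-bit integers
       RandomBits(b, 1000);

       ZZ c, d;
       mul(c, a, b);                 // c = a*b
       GCD(d, a, b);                 // d = gcd(a, b)

       std::cout << NumBits(c) << " " << NumBits(d) << "\n";
       return 0;
    }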
17266
18267
19268 <p><hr><p>
669918
670919
671920 </ul>
672
921 </ul>
673922
674923
675924 <p><hr><p>
3434 <p>
3535 <pre>
3636
37 multiply 1000-bit ints: 1.7641e-07
38 square 1000-bit ints: 1.20344e-07
39 remainder 2000/1000-bit ints: 3.59872e-07
40 gcd 1000-bit ints: 2.83256e-06
41 xgcd 1000-bit ints: 4.32945e-06
42 power mod 1000-bit ints: 0.000441862
43 multiply degree-1000 poly mod 1000-bit prime: 0.00433029
44 remainder degree-2000/1000 poly mod 1000-bit prime: 0.0125181
45 preconditioned remainder degree-2000/1000 poly mod 1000-bit prime: 0.00441719
46 gcd degree-1000 poly mod 1000-bit prime: 0.123718
47 multiply degree-1000 int poly with 1000-bit coeffs: 0.00612337
37 multiply 1000-bit ints: 1.77903e-07
38 square 1000-bit ints: 1.08537e-07
39 remainder 2000/1000-bit ints: 3.58799e-07
40 gcd 1000-bit ints: 2.86069e-06
41 xgcd 1000-bit ints: 4.27161e-06
42 power mod 1000-bit ints: 0.000424325
43 multiply degree-1000 poly mod 1000-bit prime: 0.0041019
44 remainder degree-2000/1000 poly mod 1000-bit prime: 0.0119166
45 preconditioned remainder degree-2000/1000 poly mod 1000-bit prime: 0.00418589
46 gcd degree-1000 poly mod 1000-bit prime: 0.122145
47 multiply degree-1000 int poly with 1000-bit coeffs: 0.00467749
4848
4949 factoring degree-1000 poly mod 1000-bit prime...
50 square-free decomposition...0.123419
50 square-free decomposition...0.119126
5151 factoring multiplicity 1, deg = 1000
52 computing X^p...7.28103
53 computing DDF...generating baby steps...+++++++++++++++++++++2.78938
54 generating giant steps...++++++++++++++++++++++2.89548
52 computing X^p...6.89619
53 computing DDF...generating baby steps...+++++++++++++++++++++2.72505
54 generating giant steps...++++++++++++++++++++++2.82554
5555 giant refine...++++split 1 18
5656 *++++*++++*++++*++++split 17 355
5757 *split 0 627
58 giant refine time: 4.31472
58 giant refine time: 4.09811
5959 baby refine...split 1 1
6060 split 8 8
6161 split 9 9
6262 split 355 355
6363 split 627 627
64 baby refine time: 0.03662
65 DDF time: 10.0396
66 ...total time = 17.4524
64 baby refine time: 0.037111
65 DDF time: 9.6903
66 ...total time = 16.7138
6767
68 multiply 500-bit GF2Xs: 5.50411e-08
69 remainder 1000/500-bit GF2Xs: 8.22747e-07
70 gcd 500-bit GF2Xs: 3.52091e-06
68 multiply 500-bit GF2Xs: 5.3414e-08
69 remainder 1000/500-bit GF2Xs: 8.19842e-07
70 gcd 500-bit GF2Xs: 3.57209e-06
7171
72 factoring degree-500 GF2X: 0.000148627
73 gcd 500-bit GF2X: 3.54025e-06
74 multiply degree-500 poly mod 500-bit GF2X: 0.00247895
75 remainder degree-1000/500 poly mod 500-bit GF2X: 0.00889676
76 preconditioned remainder degree-1000/500 poly mod 500-bit GF2X: 0.00500091
77 gcd degree-500 poly mod 500-bit GF2X: 0.0453614
72 factoring degree-500 GF2X: 0.000154251
73 gcd 500-bit GF2X: 3.55401e-06
74 multiply degree-500 poly mod 500-bit GF2X: 0.00247313
75 remainder degree-1000/500 poly mod 500-bit GF2X: 0.00889548
76 preconditioned remainder degree-1000/500 poly mod 500-bit GF2X: 0.00498747
77 gcd degree-500 poly mod 500-bit GF2X: 0.0451361
7878
7979 factoring degree-500 poly mod 500-bit GF2X...
80 square-free decomposition...0.004363
80 square-free decomposition...0.004369
8181 factoring multiplicity 1, deg = 250
82 computing X^p...0.476299
83 computing DDF...generating baby steps...++++++++++0.32838
84 generating giant steps...+++++++++++0.351889
82 computing X^p...0.478202
83 computing DDF...generating baby steps...++++++++++0.329912
84 generating giant steps...+++++++++++0.355037
8585 giant refine...++++split 1 9
8686 split 2 13
8787 split 4 44
8888 *++++split 7 73
8989 *split 0 111
90 giant refine time: 0.229197
90 giant refine time: 0.230542
9191 baby refine...split 9 9
9292 split 13 13
9393 split 44 44
9494 split 73 73
9595 split 111 111
96 baby refine time: 0.001222
97 DDF time: 0.910726
96 baby refine time: 0.001228
97 DDF time: 0.916753
9898
99 ...total time = 1.39327
99 ...total time = 1.39667
100100
101101 </pre>
102102
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_GF2.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_GF2.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_GF2E.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_GF2E.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_RR.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_RR.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_ZZ.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_ZZ.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_ZZ_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_ZZ_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_ZZ_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_ZZ_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_lzz_p.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_lzz_p.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vec_lzz_pE.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vec_lzz_pE.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/vector.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/vector.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/version.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/version.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
11 <html>
22 <head>
33 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
4 <title>~/ntl-10.5.0test/doc/xdouble.cpp.html</title>
4 <title>~/ntl-staging/ntl-11.0.0updated/doc/xdouble.cpp.html</title>
55 <meta name="Generator" content="Vim/8.0">
66 <meta name="plugin-version" content="vim7.4_v2">
77 <meta name="syntax" content="cpp">
Binary diff not shown
66 #include <NTL/HAVE_PCLMUL.h>
77 #include <NTL/HAVE_AVX2.h>
88 #include <NTL/HAVE_FMA.h>
9 #include <NTL/HAVE_AVX512F.h>
910 #include <NTL/HAVE_COPY_TRAITS1.h>
1011 #include <NTL/HAVE_COPY_TRAITS2.h>
1112 #include <NTL/HAVE_CHRONO_TIME.h>
88 #include <NTL/LazyTable.h>
99
1010 NTL_OPEN_NNS
11
12 #define NTL_PROVIDES_TRUNC_FFT
1113
1214 #define NTL_FFTFudge (4)
1315 // This constant is used in selecting the correct
3537
3638
3739
38 class FFTVectorPair {
39 public:
40 Vec<long> wtab_precomp;
41 Vec<mulmod_precon_t> wqinvtab_precomp;
40 // PIMPL pattern: FFTMulTabs defined in FFT.cpp
41 class FFTMulTabs;
42 struct FFTMulTabsDeleterPolicy {
43 static void deleter(FFTMulTabs *p);
4244 };
4345
44 typedef LazyTable<FFTVectorPair, NTL_FFTMaxRoot+1> FFTMultipliers;
45
46
47 class FFTMulTabs {
48 public:
49
50 FFTMultipliers MulTab[2];
51
52 };
5346
5447 class zz_pInfoT; // forward reference, defined in lzz_p.h
5548
7669 Vec<mulmod_precon_t> TwoInvPreconTable;
7770 // mulmod preconditioning data
7871
79 UniquePtr< FFTMulTabs > bigtab;
72 UniquePtr< FFTMulTabs, FFTMulTabsDeleterPolicy > bigtab;
8073
8174 };
8275
83 void InitFFTPrimeInfo(FFTPrimeInfo& info, long q, long w, bool bigtab);
76 void InitFFTPrimeInfo(FFTPrimeInfo& info, long q, long w, long bigtab_index);
8477
8578
8679 #define NTL_MAX_FFTPRIMES (20000)
133126 // allocates and initializes information for FFT prime
134127
135128
136 void FFT(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir);
137 // the low-level FFT routine.
138 // computes a 2^k point FFT modulo q = info.q
139 // dir == 0 => forward direction (using roots)
140 // dir == 1 => backwards direction (using inverse roots)
141
142
143
129 void new_fft(long* A, const long* a, long k,
130 const FFTPrimeInfo& info, long yn, long xn);
131
132 inline
133 void new_fft(long* A, const long* a, long k,
134 const FFTPrimeInfo& info)
135 { new_fft(A, a, k, info, 1L << k, 1L << k); }
136
137
138 void new_ifft(long* A, const long* a, long k,
139 const FFTPrimeInfo& info, long yn);
140
141 inline
142 void new_ifft(long* A, const long* a, long k,
143 const FFTPrimeInfo& info)
144 { new_ifft(A, a, k, info, 1L << k); }
145
146
147 void new_fft_flipped(long* A, const long* a, long k,
148 const FFTPrimeInfo& info);
149
150 void new_ifft_flipped(long* A, const long* a, long k,
151 const FFTPrimeInfo& info);
144152
145153
146154 inline
147155 void FFTFwd(long* A, const long *a, long k, const FFTPrimeInfo& info)
148 // Slightly higher level interface...using the ith FFT prime
149 {
150 FFT(A, a, k, info, 0);
151 }
152
156 {
157 new_fft(A, a, k, info);
158 }
159
160 inline
161 void FFTFwd_trunc(long* A, const long *a, long k, const FFTPrimeInfo& info,
162 long yn, long xn)
163 {
164 new_fft(A, a, k, info, yn, xn);
165 }
166
167 inline
168 void FFTFwd_trans(long* A, const long *a, long k, const FFTPrimeInfo& info)
169 {
170 new_ifft_flipped(A, a, k, info);
171 }
153172
154173 inline
155174 void FFTFwd(long* A, const long *a, long k, long i)
175 // Slightly higher level interface...using the ith FFT prime
156176 {
157177 FFTFwd(A, a, k, *FFTTables[i]);
158178 }
159179
160180 inline
161 void FFTRev(long* A, const long *a, long k, const FFTPrimeInfo& info)
162 // Slightly higher level interface...using the ith FFT prime
163 {
164 FFT(A, a, k, info, 1);
165 }
166
167 inline
168 void FFTRev(long* A, const long *a, long k, long i)
169 {
170 FFTRev(A, a, k, *FFTTables[i]);
171 }
172
173 inline
174 void FFTMulTwoInv(long* A, const long *a, long k, const FFTPrimeInfo& info)
175 {
176 VectorMulModPrecon(1L << k, A, a, info.TwoInvTable[k], info.q,
177 info.TwoInvPreconTable[k]);
178 }
179
180 inline
181 void FFTMulTwoInv(long* A, const long *a, long k, long i)
182 {
183 FFTMulTwoInv(A, a, k, *FFTTables[i]);
184 }
185
186 inline
181 void FFTFwd_trunc(long* A, const long *a, long k, long i, long yn, long xn)
182 // Slightly higher level interface...using the ith FFT prime
183 {
184 FFTFwd_trunc(A, a, k, *FFTTables[i], yn, xn);
185 }
186
187 inline
188 void FFTFwd_trans(long* A, const long *a, long k, long i)
189 // Slightly higher level interface...using the ith FFT prime
190 {
191 FFTFwd_trans(A, a, k, *FFTTables[i]);
192 }
193
194
195
196
197 inline
187198 void FFTRev1(long* A, const long *a, long k, const FFTPrimeInfo& info)
188 // FFTRev + FFTMulTwoInv
189 {
190 FFTRev(A, a, k, info);
191 FFTMulTwoInv(A, A, k, info);
192 }
193
194 inline
199 {
200 new_ifft(A, a, k, info);
201 }
202
203 inline
204 void FFTRev1_trunc(long* A, const long *a, long k, const FFTPrimeInfo& info,
205 long yn)
206 {
207 new_ifft(A, a, k, info, yn);
208 }
209
210 inline
211 void FFTRev1_trans(long* A, const long *a, long k, const FFTPrimeInfo& info)
212 {
213 new_fft_flipped(A, a, k, info);
214 }
215
216 inline
195217 void FFTRev1(long* A, const long *a, long k, long i)
218 // Slightly higher level interface...using the ith FFT prime
196219 {
197220 FFTRev1(A, a, k, *FFTTables[i]);
198221 }
222
223 inline
224 void FFTRev1_trunc(long* A, const long *a, long k, long i, long yn)
225 // Slightly higher level interface...using the ith FFT prime
226 {
227 FFTRev1_trunc(A, a, k, *FFTTables[i], yn);
228 }
229
230 inline
231 void FFTRev1_trans(long* A, const long *a, long k, long i)
232 // Slightly higher level interface...using the ith FFT prime
233 {
234 FFTRev1_trans(A, a, k, *FFTTables[i]);
235 }
236
199237
200238
201239 long IsFFTPrime(long n, long& w);
0
1 #ifndef NTL_FFT_impl__H
2 #define NTL_FFT_impl__H
3
4 #include <NTL/tools.h>
5
6 NTL_OPEN_NNS
7
8 #ifdef NTL_ENABLE_AVX_FFT
9
10 #if (!defined(NTL_HAVE_AVX512F) && !(defined(NTL_HAVE_AVX2) && defined(NTL_HAVE_FMA)))
11 #error "NTL_ENABLE_AVX_FFT: not supported on this platform"
12 #endif
13
14 #if (defined(NTL_HAVE_AVX512F) && !defined(NTL_AVOID_AVX512))
15 #define NTL_LG2_PDSZ (3)
16 #else
17 #define NTL_LG2_PDSZ (2)
18 #endif
19
20 #define NTL_FFT_RDUP (NTL_LG2_PDSZ+3)
21 #define NTL_PDSZ (1 << NTL_LG2_PDSZ)
22
23 #else
24
25 #define NTL_FFT_RDUP (4)
26 // Currently, this should be at least 2 to support
27 // loop unrolling in the FFT implementation
28
29 #endif
30
31 inline
32 long FFTRoundUp(long xn, long k)
33 {
34 long n = 1L << k;
35 if (xn <= 0) return n;
36 // default truncation value of 0 gets converted to n
37
38 xn = ((xn+((1L << NTL_FFT_RDUP)-1)) >> NTL_FFT_RDUP) << NTL_FFT_RDUP;
39
40 if (k >= 10) {
41 if (xn > n - (n >> 4)) xn = n;
42 }
43 else {
44 if (xn > n - (n >> 3)) xn = n;
45 }
46 // truncation just a bit below n does not really help
47 // at all, and can sometimes slow things down slightly, so round up
48 // to n. This also takes care of cases where xn > n.
49 // Actually, for smallish n, we should round up sooner,
50 // at n-n/8, and for larger n, we should round up later,
51 // at n-n/16. At least, experimentally, this is what I see.
52
53 return xn;
54 }
55
56
57 NTL_CLOSE_NNS
58
59 #endif
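To illustrate FFTRoundUp, here are a few hand-worked values (a sketch, assuming the non-AVX setting NTL_FFT_RDUP = 4, i.e. truncation lengths rounded to multiples of 16):

    // FFTRoundUp( 500, 10): 500 rounds up to 512;  512 <= 1024 - 1024/16 = 960, so return 512
    // FFTRoundUp(1000, 10): 1000 rounds up to 1008; 1008 > 960, so round all the way up to 1024
    // FFTRoundUp(   0, 10): xn <= 0 means "untruncated", so return n = 1024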
2525 long KarCross;
2626 long ModCross;
2727 long DivCross;
28 long GCDCross;
2829
2930 long _card_exp;
3031 Lazy<ZZ> _card;
140141 static long KarCross() { return GF2EInfo->KarCross; }
141142 static long ModCross() { return GF2EInfo->ModCross; }
142143 static long DivCross() { return GF2EInfo->DivCross; }
144 static long GCDCross() { return GF2EInfo->GCDCross; }
143145
144146 static long degree() { return GF2EInfo->p.n; }
145147
0 #ifndef NTL_PD__H
1 #define NTL_PD__H
2
3 #include <NTL/tools.h>
4 #include <immintrin.h>
5
6 NTL_OPEN_NNS
7
8
9 template<int N>
10 struct PD {
11 private:
12 PD();
13 };
14
15
16 // FIXME: should distinguish more carefully:
17 // AVX512DQ for long/double conversions
18 // AVX512VL for certain ops applied to shorter types:
19 // long/double conversions and mask ops
20 // may need to translate long/double conversions for non-AVXDQ512
21
22
23
24 //=================== PD<8> implementation ===============
25
26 #ifdef NTL_HAVE_AVX512F
27
28 template<>
29 struct PD<8> {
30 __m512d data;
31
32 enum { size = 8};
33
34 PD() { }
35 PD(double x) : data(_mm512_set1_pd(x)) { }
36 PD(__m512d _data) : data(_data) { }
37
38 PD(double d0, double d1, double d2, double d3,
39 double d4, double d5, double d6, double d7)
40 : data(_mm512_set_pd(d7, d6, d5, d4, d3, d2, d1, d0)) { }
41
42 static PD load(const double *p) { return _mm512_load_pd(p); }
43
44 // load from unaligned address
45 static PD loadu(const double *p) { return _mm512_loadu_pd(p); }
46 };
47
48 inline void
49 load(PD<8>& x, const double *p)
50 { x = PD<8>::load(p); }
51
52 // load from unaligned address
53 inline void
54 loadu(PD<8>& x, const double *p)
55 { x = PD<8>::loadu(p); }
56
57 inline void
58 store(double *p, PD<8> a)
59 { _mm512_store_pd(p, a.data); }
60
61 // store to unaligned address
62 inline void
63 storeu(double *p, PD<8> a)
64 { _mm512_storeu_pd(p, a.data); }
65
66 // load and convert
67 inline void
68 load(PD<8>& x, const long *p)
69 { __m512i a = _mm512_load_epi64(p); x = _mm512_cvtepi64_pd(a); }
70
71 // load unaligned and convert
72 inline void
73 loadu(PD<8>& x, const long *p)
74 { __m512i a = _mm512_loadu_si512(p); x = _mm512_cvtepi64_pd(a); }
75
76 // convert and store
77 inline void
78 store(long *p, PD<8> a)
79 { __m512i b = _mm512_cvtpd_epi64(a.data); _mm512_store_epi64(p, b); }
80
81 // convert and store unaligned
82 inline void
83 storeu(long *p, PD<8> a)
84 { __m512i b = _mm512_cvtpd_epi64(a.data); _mm512_storeu_si512(p, b); }
85
86
87 // swap even/odd slots
88 // e.g., 01234567 -> 10325476
89 inline PD<8>
90 swap2(PD<8> a)
91 { return _mm512_permute_pd(a.data, 0x55); }
92
93 // swap even/odd slot-pairs
94 // e.g., 01234567 -> 23016745
95 inline PD<8>
96 swap4(PD<8> a)
97 { return _mm512_permutex_pd(a.data, 0x4e); }
98
99 // 01234567 -> 00224466
100 inline PD<8>
101 dup2even(PD<8> a)
102 { return _mm512_permute_pd(a.data, 0); }
103
104 // 01234567 -> 11335577
105 inline PD<8>
106 dup2odd(PD<8> a)
107 { return _mm512_permute_pd(a.data, 0xff); }
108
109 // 01234567 -> 01014545
110 inline PD<8>
111 dup4even(PD<8> a)
112 { return _mm512_permutex_pd(a.data, 0x44); }
113
114 // 01234567 -> 23236767
115 inline PD<8>
116 dup4odd(PD<8> a)
117 { return _mm512_permutex_pd(a.data, 0xee); }
118
119 // blend even/odd slots
120 // 01234567, 89abcdef -> 092b4d6f
121 inline PD<8>
122 blend2(PD<8> a, PD<8> b)
123 { return _mm512_mask_blend_pd(0xaa, a.data, b.data); }
124 // FIXME: why isn't there an intrinsic that doesn't require a mask register?
125
126 // blend even/odd slot-pairs
127 // 01234567, 89abcdef -> 01ab45ef
128 inline PD<8>
129 blend4(PD<8> a, PD<8> b)
130 { return _mm512_mask_blend_pd(0xcc, a.data, b.data); }
131 // FIXME: why isn't there an intrinsic that doesn't require a mask register?
132
133 // res[i] = a[i] < b[i] ? a[i] : a[i]-b[i]
134 inline PD<8>
135 correct_excess(PD<8> a, PD<8> b)
136 {
137 __mmask8 k = _mm512_cmp_pd_mask(a.data, b.data, _CMP_GE_OQ);
138 return _mm512_mask_sub_pd(a.data, k, a.data, b.data);
139 }
140
141 // res[i] = a[i] >= 0 ? a[i] : a[i]+b[i]
142 inline PD<8>
143 correct_deficit(PD<8> a, PD<8> b)
144 {
145 __mmask8 k = _mm512_cmp_pd_mask(a.data, _mm512_setzero_pd(), _CMP_LT_OQ);
146 return _mm512_mask_add_pd(a.data, k, a.data, b.data);
147 }
148
149 inline void
150 clear(PD<8>& x)
151 { x.data = _mm512_setzero_pd(); }
152
153 inline PD<8>
154 operator+(PD<8> a, PD<8> b)
155 { return _mm512_add_pd(a.data, b.data); }
156
157 inline PD<8>
158 operator-(PD<8> a, PD<8> b)
159 { return _mm512_sub_pd(a.data, b.data); }
160
161 inline PD<8>
162 operator*(PD<8> a, PD<8> b)
163 { return _mm512_mul_pd(a.data, b.data); }
164
165 inline PD<8>
166 operator/(PD<8> a, PD<8> b)
167 { return _mm512_div_pd(a.data, b.data); }
168
169 inline PD<8>&
170 operator+=(PD<8>& a, PD<8> b)
171 { a = a + b; return a; }
172
173 inline PD<8>&
174 operator-=(PD<8>& a, PD<8> b)
175 { a = a - b; return a; }
176
177 inline PD<8>&
178 operator*=(PD<8>& a, PD<8> b)
179 { a = a * b; return a; }
180
181 inline PD<8>&
182 operator/=(PD<8>& a, PD<8> b)
183 { a = a / b; return a; }
184
185 // a*b+c (fused)
186 inline PD<8>
187 fused_muladd(PD<8> a, PD<8> b, PD<8> c)
188 { return _mm512_fmadd_pd(a.data, b.data, c.data); }
189
190 // a*b-c (fused)
191 inline PD<8>
192 fused_mulsub(PD<8> a, PD<8> b, PD<8> c)
193 { return _mm512_fmsub_pd(a.data, b.data, c.data); }
194
195 // -a*b+c (fused)
196 inline PD<8>
197 fused_negmuladd(PD<8> a, PD<8> b, PD<8> c)
198 { return _mm512_fnmadd_pd(a.data, b.data, c.data); }
199
200 #endif
201
202 //=================== PD<4> implementation ===============
203
204 #if (defined(NTL_HAVE_AVX2) && defined(NTL_HAVE_FMA))
205
206 template<>
207 struct PD<4> {
208 __m256d data;
209
210 enum { size = 4};
211
212 PD() { }
213 PD(double x) : data(_mm256_set1_pd(x)) { }
214 PD(__m256d _data) : data(_data) { }
215 PD(double d0, double d1, double d2, double d3)
216 : data(_mm256_set_pd(d3, d2, d1, d0)) { }
217
218 static PD load(const double *p) { return _mm256_load_pd(p); }
219
220 // load from unaligned address
221 static PD loadu(const double *p) { return _mm256_loadu_pd(p); }
222 };
223
224 inline void
225 load(PD<4>& x, const double *p)
226 { x = PD<4>::load(p); }
227
228 // load from unaligned address
229 inline void
230 loadu(PD<4>& x, const double *p)
231 { x = PD<4>::loadu(p); }
232
233 inline void
234 store(double *p, PD<4> a)
235 { _mm256_store_pd(p, a.data); }
236
237 // store to unaligned address
238 inline void
239 storeu(double *p, PD<4> a)
240 { _mm256_storeu_pd(p, a.data); }
241
242
243
244
245
246 // The following assume all numbers are integers
247 // in the range [0, 2^52). The idea is taken from here:
248 // https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
249
250
251 // Some of the Intel intrinsics for loading and storing packed
252 // integers from memory require casting between long* and __m256i*.
253 // Strictly speaking, this can break strict aliasing rules, but
254 // this is hopefully not a problem.
255 // See discussion here:
256 // https://stackoverflow.com/questions/24787268/how-to-implement-mm-storeu-epi64-without-aliasing-problems
257
258
259 // load and convert
260 inline void
261 load(PD<4>& x, const long *p)
262 {
263 #ifdef NTL_HAVE_AVX512F
264 __m256i a = _mm256_load_si256((const __m256i*)p);
265 x = _mm256_cvtepi64_pd(a);
266 #else
267 __m256i a = _mm256_load_si256((const __m256i*)p);
268 a = _mm256_or_si256(a, _mm256_castpd_si256(_mm256_set1_pd(1L << 52)));
269 x = _mm256_sub_pd(_mm256_castsi256_pd(a), _mm256_set1_pd(1L << 52));
270 #endif
271 }
272
273 // load unaligned and convert
274 inline void
275 loadu(PD<4>& x, const long *p)
276 {
277 #ifdef NTL_HAVE_AVX512F
278 __m256i a = _mm256_loadu_si256((const __m256i*)p); x = _mm256_cvtepi64_pd(a);
279 #else
280 __m256i a = _mm256_loadu_si256((const __m256i*)p);
281 a = _mm256_or_si256(a, _mm256_castpd_si256(_mm256_set1_pd(1L << 52)));
282 x = _mm256_sub_pd(_mm256_castsi256_pd(a), _mm256_set1_pd(1L << 52));
283 #endif
284 }
285
286 // convert and store
287 inline void
288 store(long *p, PD<4> a)
289 {
290 #ifdef NTL_HAVE_AVX512F
291 __m256i b = _mm256_cvtpd_epi64(a.data);
292 #ifdef __clang__
293 _mm256_store_si256((__m256i*)p, b);
294 #else
295 // clang doesn't define this...why??
296 _mm256_store_epi64(p, b);
297 #endif
298 #else
299 __m256d x = a.data;
300 x = _mm256_add_pd(x, _mm256_set1_pd(1L << 52));
301 __m256i b = _mm256_xor_si256(
302 _mm256_castpd_si256(x),
303 _mm256_castpd_si256(_mm256_set1_pd(1L << 52)));
304 _mm256_store_si256((__m256i*)p, b);
305 #endif
306 }
307
308 // convert and store unaligned
309 inline void
310 storeu(long *p, PD<4> a)
311 {
312 #ifdef NTL_HAVE_AVX512F
313 __m256i b = _mm256_cvtpd_epi64(a.data);
314 _mm256_storeu_si256((__m256i*)p, b);
315 #else
316 __m256d x = a.data;
317 x = _mm256_add_pd(x, _mm256_set1_pd(1L << 52));
318 __m256i b = _mm256_xor_si256(
319 _mm256_castpd_si256(x),
320 _mm256_castpd_si256(_mm256_set1_pd(1L << 52)));
321 _mm256_storeu_si256((__m256i*)p, b);
322 #endif
323 }
324
325
326 // swap even/odd slots
327 // e.g., 0123 -> 1032
328 inline PD<4>
329 swap2(PD<4> a)
330 { return _mm256_permute_pd(a.data, 0x5); }
331
332 // 0123 -> 0022
333 inline PD<4>
334 dup2even(PD<4> a)
335 { return _mm256_permute_pd(a.data, 0); }
336
337 // 0123 -> 1133
338 inline PD<4>
339 dup2odd(PD<4> a)
340 { return _mm256_permute_pd(a.data, 0xf); }
341
342 // blend even/odd slots
343 // 0123, 4567 -> 0527
344 inline PD<4>
345 blend2(PD<4> a, PD<4> b)
346 { return _mm256_blend_pd(a.data, b.data, 0xa); }
347
348 // res[i] = a[i] < b[i] ? a[i] : a[i]-b[i]
349 inline PD<4>
350 correct_excess(PD<4> a, PD<4> b)
351 {
352 #ifdef NTL_HAVE_AVX512F
353 __mmask8 k = _mm256_cmp_pd_mask(a.data, b.data, _CMP_GE_OQ);
354 return _mm256_mask_sub_pd(a.data, k, a.data, b.data);
355 #else
356 __m256d mask = _mm256_cmp_pd(a.data, b.data, _CMP_GE_OQ);
357 __m256d corrected = _mm256_sub_pd(a.data, b.data);
358 return _mm256_blendv_pd(a.data, corrected, mask);
359 #endif
360 }
361
362 // res[i] = a[i] >= 0 ? a[i] : a[i]+b[i]
363 inline PD<4>
364 correct_deficit(PD<4> a, PD<4> b)
365 {
366 #ifdef NTL_HAVE_AVX512F
367 __mmask8 k = _mm256_cmp_pd_mask(a.data, _mm256_setzero_pd(), _CMP_LT_OQ);
368 return _mm256_mask_add_pd(a.data, k, a.data, b.data);
369 #else
370 __m256d mask = _mm256_cmp_pd(a.data, _mm256_setzero_pd(), _CMP_LT_OQ);
371 __m256d corrected = _mm256_add_pd(a.data, b.data);
372 return _mm256_blendv_pd(a.data, corrected, mask);
373 #endif
374 }
375
376 inline void
377 clear(PD<4>& x)
378 { x.data = _mm256_setzero_pd(); }
379
380 inline PD<4>
381 operator+(PD<4> a, PD<4> b)
382 { return _mm256_add_pd(a.data, b.data); }
383
384 inline PD<4>
385 operator-(PD<4> a, PD<4> b)
386 { return _mm256_sub_pd(a.data, b.data); }
387
388 inline PD<4>
389 operator*(PD<4> a, PD<4> b)
390 { return _mm256_mul_pd(a.data, b.data); }
391
392 inline PD<4>
393 operator/(PD<4> a, PD<4> b)
394 { return _mm256_div_pd(a.data, b.data); }
395
396 inline PD<4>&
397 operator+=(PD<4>& a, PD<4> b)
398 { a = a + b; return a; }
399
400 inline PD<4>&
401 operator-=(PD<4>& a, PD<4> b)
402 { a = a - b; return a; }
403
404 inline PD<4>&
405 operator*=(PD<4>& a, PD<4> b)
406 { a = a * b; return a; }
407
408 inline PD<4>&
409 operator/=(PD<4>& a, PD<4> b)
410 { a = a / b; return a; }
411
412 // a*b+c (fused)
413 inline PD<4>
414 fused_muladd(PD<4> a, PD<4> b, PD<4> c)
415 { return _mm256_fmadd_pd(a.data, b.data, c.data); }
416
417 // a*b-c (fused)
418 inline PD<4>
419 fused_mulsub(PD<4> a, PD<4> b, PD<4> c)
420 { return _mm256_fmsub_pd(a.data, b.data, c.data); }
421
422 // -a*b+c (fused)
423 inline PD<4>
424 fused_negmuladd(PD<4> a, PD<4> b, PD<4> c)
425 { return _mm256_fnmadd_pd(a.data, b.data, c.data); }
426
427
428 //=================== PD<2> implementation ===============
429
430
431 template<>
432 struct PD<2> {
433 __m128d data;
434
435 enum { size = 2};
436
437 PD() { }
438 PD(double x) : data(_mm_set1_pd(x)) { }
439 PD(__m128d _data) : data(_data) { }
440 PD(double d0, double d1)
441 : data(_mm_set_pd(d1, d0)) { }
442
443 static PD load(const double *p) { return _mm_load_pd(p); }
444
445 // load from unaligned address
446 static PD loadu(const double *p) { return _mm_loadu_pd(p); }
447 };
448
449 inline void
450 load(PD<2>& x, const double *p)
451 { x = PD<2>::load(p); }
452
453 // load from unaligned address
454 inline void
455 loadu(PD<2>& x, const double *p)
456 { x = PD<2>::loadu(p); }
457
458 inline void
459 store(double *p, PD<2> a)
460 { _mm_store_pd(p, a.data); }
461
462 // store to unaligned address
463 inline void
464 storeu(double *p, PD<2> a)
465 { _mm_storeu_pd(p, a.data); }
466
467
468
469
470
471 // The following assume all numbers are integers
472 // in the range [0, 2^52). The idea is taken from here:
473 // https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
474
475 // load and convert
476 inline void
477 load(PD<2>& x, const long *p)
478 {
479 #ifdef NTL_HAVE_AVX512F
480 __m128i a = _mm_load_si128((const __m128i*)p);
481 x = _mm_cvtepi64_pd(a);
482 #else
483 __m128i a = _mm_load_si128((const __m128i*)p);
484 a = _mm_or_si128(a, _mm_castpd_si128(_mm_set1_pd(1L << 52)));
485 x = _mm_sub_pd(_mm_castsi128_pd(a), _mm_set1_pd(1L << 52));
486 #endif
487 }
488
489 // load unaligned and convert
490 inline void
491 loadu(PD<2>& x, const long *p)
492 {
493 #ifdef NTL_HAVE_AVX512F
494 __m128i a = _mm_loadu_si128((const __m128i*)p); x = _mm_cvtepi64_pd(a);
495 #else
496 __m128i a = _mm_loadu_si128((const __m128i*)p);
497 a = _mm_or_si128(a, _mm_castpd_si128(_mm_set1_pd(1L << 52)));
498 x = _mm_sub_pd(_mm_castsi128_pd(a), _mm_set1_pd(1L << 52));
499 #endif
500 }
501
502 // convert and store
503 inline void
504 store(long *p, PD<2> a)
505 {
506 #ifdef NTL_HAVE_AVX512F
507 __m128i b = _mm_cvtpd_epi64(a.data);
508 #ifdef __clang__
509 _mm_store_si128((__m128i*)p, b);
510 #else
511 // clang doesn't define this...why??
512 _mm_store_epi64(p, b);
513 #endif
514 #else
515 __m128d x = a.data;
516 x = _mm_add_pd(x, _mm_set1_pd(1L << 52));
517 __m128i b = _mm_xor_si128(
518 _mm_castpd_si128(x),
519 _mm_castpd_si128(_mm_set1_pd(1L << 52)));
520 _mm_store_si128((__m128i*)p, b);
521 #endif
522 }
523
524 // convert and store unaligned
525 inline void
526 storeu(long *p, PD<2> a)
527 {
528 #ifdef NTL_HAVE_AVX512F
529 __m128i b = _mm_cvtpd_epi64(a.data);
530 _mm_storeu_si128((__m128i*)p, b);
531 #else
532 __m128d x = a.data;
533 x = _mm_add_pd(x, _mm_set1_pd(1L << 52));
534 __m128i b = _mm_xor_si128(
535 _mm_castpd_si128(x),
536 _mm_castpd_si128(_mm_set1_pd(1L << 52)));
537 _mm_storeu_si128((__m128i*)p, b);
538 #endif
539 }
540
541
542 // res[i] = a[i] < b[i] ? a[i] : a[i]-b[i]
543 inline PD<2>
544 correct_excess(PD<2> a, PD<2> b)
545 {
546 #ifdef NTL_HAVE_AVX512F
547 __mmask8 k = _mm_cmp_pd_mask(a.data, b.data, _CMP_GE_OQ);
548 return _mm_mask_sub_pd(a.data, k, a.data, b.data);
549 #else
550 __m128d mask = _mm_cmp_pd(a.data, b.data, _CMP_GE_OQ);
551 __m128d corrected = _mm_sub_pd(a.data, b.data);
552 return _mm_blendv_pd(a.data, corrected, mask);
553 #endif
554 }
555
556 // res[i] = a[i] >= 0 ? a[i] : a[i]+b[i]
557 inline PD<2>
558 correct_deficit(PD<2> a, PD<2> b)
559 {
560 #ifdef NTL_HAVE_AVX512F
561 __mmask8 k = _mm_cmp_pd_mask(a.data, _mm_setzero_pd(), _CMP_LT_OQ);
562 return _mm_mask_add_pd(a.data, k, a.data, b.data);
563 #else
564 __m128d mask = _mm_cmp_pd(a.data, _mm_setzero_pd(), _CMP_LT_OQ);
565 __m128d corrected = _mm_add_pd(a.data, b.data);
566 return _mm_blendv_pd(a.data, corrected, mask);
567 #endif
568 }
569
570 inline void
571 clear(PD<2>& x)
572 { x.data = _mm_setzero_pd(); }
573
574 inline PD<2>
575 operator+(PD<2> a, PD<2> b)
576 { return _mm_add_pd(a.data, b.data); }
577
578 inline PD<2>
579 operator-(PD<2> a, PD<2> b)
580 { return _mm_sub_pd(a.data, b.data); }
581
582 inline PD<2>
583 operator*(PD<2> a, PD<2> b)
584 { return _mm_mul_pd(a.data, b.data); }
585
586 inline PD<2>
587 operator/(PD<2> a, PD<2> b)
588 { return _mm_div_pd(a.data, b.data); }
589
590 inline PD<2>&
591 operator+=(PD<2>& a, PD<2> b)
592 { a = a + b; return a; }
593
594 inline PD<2>&
595 operator-=(PD<2>& a, PD<2> b)
596 { a = a - b; return a; }
597
598 inline PD<2>&
599 operator*=(PD<2>& a, PD<2> b)
600 { a = a * b; return a; }
601
602 inline PD<2>&
603 operator/=(PD<2>& a, PD<2> b)
604 { a = a / b; return a; }
605
606 // a*b+c (fused)
607 inline PD<2>
608 fused_muladd(PD<2> a, PD<2> b, PD<2> c)
609 { return _mm_fmadd_pd(a.data, b.data, c.data); }
610
611 // a*b-c (fused)
612 inline PD<2>
613 fused_mulsub(PD<2> a, PD<2> b, PD<2> c)
614 { return _mm_fmsub_pd(a.data, b.data, c.data); }
615
616 // -a*b+c (fused)
617 inline PD<2>
618 fused_negmuladd(PD<2> a, PD<2> b, PD<2> c)
619 { return _mm_fnmadd_pd(a.data, b.data, c.data); }
620
621
622
623
624 //================== PD<8>/PD<4> conversions ================
625
626 #ifdef NTL_HAVE_AVX512F
627
628 // 0123, 4567 -> 01234567
629 inline PD<8>
630 join(PD<4> a, PD<4> b)
631 {
632 __m512d c = _mm512_castpd256_pd512(a.data);
633 return _mm512_insertf64x4(c, b.data, 1);
634 }
635
636 // 01234567 -> 0123
637 inline PD<4>
638 get_lo(PD<8> a)
639 { return _mm512_extractf64x4_pd(a.data, 0); }
640
641 // 01234567 -> 4567
642 inline PD<4>
643 get_hi(PD<8> a)
644 { return _mm512_extractf64x4_pd(a.data, 1); }
645
646 #endif
647
648 //================== PD<4>/PD<2> conversions ================
649
650 // 01, 23 -> 0123
651 inline PD<4>
652 join(PD<2> a, PD<2> b)
653 #if 0
654 // some versions of gcc are buggy and don't define this function
655 { return _mm256_set_m128d(b.data, a.data); }
656 #else
657 { return _mm256_insertf128_pd(_mm256_castpd128_pd256(a.data), b.data, 1); }
658 #endif
659
660
661 // 0123 -> 01
662 inline PD<2>
663 get_lo(PD<4> a)
664 { return _mm256_extractf128_pd(a.data, 0); }
665
666 // 0123 -> 23
667 inline PD<2>
668 get_hi(PD<4> a)
669 { return _mm256_extractf128_pd(a.data, 1); }
670
671
672 #endif
673
674
675 NTL_CLOSE_NNS
676
677
678 #endif
3030 std::cerr << "NTL_HAVE_FMA\n";
3131 #endif
3232
33 #ifdef NTL_HAVE_AVX512F
34 std::cerr << "NTL_HAVE_AVX512F\n";
35 #endif
36
3337 #ifdef NTL_HAVE_COPY_TRAITS1
3438 std::cerr << "NTL_HAVE_COPY_TRAITS1\n";
3539 #endif
3535
3636 #ifndef NTL_WordVectorMinAlloc
3737 #define NTL_WordVectorMinAlloc (4)
38 #endif
39
40 // vectors are always expanded by at least this ratio
41
42 #ifndef NTL_WordVectorExpansionRatio
43 #define NTL_WordVectorExpansionRatio (1.4)
4438 #endif
4539
4640 // controls initialization during input
391391 inline void add(ZZ& x, long a, const ZZ& b) { add(x, b, a); }
392392
393393
394 void sub(ZZ& x, const ZZ& a, long b);
394 inline void sub(ZZ& x, const ZZ& a, long b)
395 { _ntl_gssub(a.rep, b, &x.rep); }
396
395397 void sub(ZZ& x, long a, const ZZ& b);
398 // defined in ZZ.cpp
396399
397400 /* operator/function notation */
398401
17561759
17571760 void InvModError(const char *s, const ZZ& a, const ZZ& n);
17581761
1762 #ifdef NTL_PROVIDES_SS_LIP_IMPL
1763
1764 inline void
1765 LeftRotate_lip_impl(ZZ& a, const ZZ& b, long e, const ZZ& p, long n, ZZ& scratch)
1766 // Compute a = b * 2^e mod p, where p = 2^n+1. 0<=e<n and 0<b<p
1767 // a may not alias p
1768 // scratch may not alias a, b, or p
1769 {
1770 _ntl_leftrotate(&a.rep, &b.rep, e, p.rep, n, &scratch.rep);
1771 }
1772
1773 inline void
1774 SS_AddMod_lip_impl(ZZ& x, const ZZ& a, const ZZ& b, const ZZ& p, long n)
1775 // x = a + b mod p, where p = 2^n+1, a, b in [0, p).
1776 // x may not alias p.
1777 {
1778 _ntl_ss_addmod(&x.rep, &a.rep, &b.rep, p.rep, n);
1779 }
1780
1781 inline void
1782 SS_SubMod_lip_impl(ZZ& x, const ZZ& a, const ZZ& b, const ZZ& p, long n)
1783 // x = a - b mod p, where p = 2^n+1, a, b in [0, p).
1784 // x may not alias b or p.
1785 {
1786 _ntl_ss_submod(&x.rep, &a.rep, &b.rep, p.rep, n);
1787 }
1788
1789 #endif
1790
1791
1792
1793
1794
17591795 NTL_CLOSE_NNS
17601796
17611797
561561 public:
562562 long k; // a 2^k point representation
563563 long MaxK; // maximum space allocated
564 long len; // length of truncated FFT
564565 long NumPrimes;
565566 Unique2DArray<long> tbl;
566567
567 FFTRep() : k(-1), MaxK(-1), NumPrimes(0) { }
568
569 FFTRep(const FFTRep& R) : k(-1), MaxK(-1), NumPrimes(0)
568 FFTRep() : k(-1), MaxK(-1), len(0), NumPrimes(0) { }
569
570 FFTRep(const FFTRep& R) : k(-1), MaxK(-1), len(0), NumPrimes(0)
570571 { *this = R; }
571572
572 FFTRep(INIT_SIZE_TYPE, long InitK) : k(-1), MaxK(-1), NumPrimes(0)
573 FFTRep(INIT_SIZE_TYPE, long InitK) : k(-1), MaxK(-1), len(0), NumPrimes(0)
573574 { SetSize(InitK); }
574575
575576 FFTRep& operator=(const FFTRep& R);
578579 };
579580
580581
581 void ToFFTRep(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi);
582 void ToFFTRep_trunc(FFTRep& y, const ZZ_pX& x, long k, long len,
583 long lo, long hi);
584
585 inline void ToFFTRep_trunc(FFTRep& y, const ZZ_pX& x, long k, long len)
586 { ToFFTRep_trunc(y, x, k, len, 0, deg(x)); }
587
588 inline
589 void ToFFTRep(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi)
582590 // computes an n = 2^k point convolution of x[lo..hi].
591 { ToFFTRep_trunc(y, x, k, 0, lo, hi); }
583592
584593 inline void ToFFTRep(FFTRep& y, const ZZ_pX& x, long k)
585594
224224
225225 #endif
226226
227 #if 1
228 #define NTL_ENABLE_AVX_FFT
229
230 /*
231 * This will compile NTL in a way that enables an AVX implementation
232 * of the small-prime FFT.
233 */
234
235 #endif
236
237
238 #if 0
239 #define NTL_AVOID_AVX512
240
241 /*
242 * This will compile NTL in a way that avoids 512-bit operations,
243 * even if AVX512 is available.
244 */
245
246 #endif
227247
228248 #if 0
229249 #define NTL_RANGE_CHECK
88
99 #include <NTL/PackageInfo.h>
1010
11
12 #if (!defined(NTL_HAVE_LL_TYPE) && defined(_MSC_VER) && defined(NTL_WINPACK))
13 // for the windows distribution, for MSVC++ we assume LL_TYPE works
11 #if (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)))
12 #define NTL_GNUC_INTEL
13 #endif
14
15 #if (!defined(NTL_HAVE_LL_TYPE) && defined(NTL_WINPACK) && (defined(_MSC_VER) || defined(NTL_GNUC_INTEL)))
16 // for the windows distribution,
17 // we assume LL_TYPE works for MSVC++ (which is true for both x86 and ARM)
18 // and for GNUC/Intel platforms (e.g., Code Blocks)
1419 #define NTL_HAVE_LL_TYPE
1520 #endif
1621
503508
504509 #define NTL_AVX_LOCAL_ARRAY(x, type, n) NTL_ALIGNED_LOCAL_ARRAY(NTL_AVX_BYTE_ALIGN, x, type, n)
505510
511 #define NTL_AVX512_BYTE_ALIGN (64)
512
513 #define NTL_AVX512_LOCAL_ARRAY(x, type, n) NTL_ALIGNED_LOCAL_ARRAY(NTL_AVX512_BYTE_ALIGN, x, type, n)
514
515
506516 #define NTL_DEFAULT_ALIGN (64)
507517 // this should be big enough to satisfy any SIMD instructions,
508518 // and it should also be as big as a cache line
572582 }
573583
574584
585 // vectors are grown by a factor of 1.5
586 inline long _ntl_vec_grow(long n)
587 { return n + n/2; }
588
575589
576590 template <class T>
577591 struct _ntl_is_char_pointer
604618
605619
606620
607
608 #endif
621 #endif
1313 * but better for debugging.
1414 */
1515
16 struct _ntl_gbigint_body;
16 struct _ntl_gbigint_body {
17 long alloc_;
18 long size_;
19 };
20
1721 typedef _ntl_gbigint_body *_ntl_gbigint;
1822
1923
3842 #endif
3943
4044
41
42
43 #elif (NTL_LONGDOUBLE_OK && !defined(NTL_LEGACY_SP_MULMOD) && !defined(NTL_DISABLE_LONGDOUBLE))
45 #if (defined(NTL_ENABLE_AVX_FFT) && (NTL_SP_NBITS > 50))
46 #undef NTL_SP_NBITS
47 #define NTL_SP_NBITS (50)
48 #endif
49
50
51 #elif (NTL_LONGDOUBLE_OK && !defined(NTL_LEGACY_SP_MULMOD) && !defined(NTL_DISABLE_LONGDOUBLE) && !defined(NTL_ENABLE_AVX_FFT))
4452
4553 #define NTL_LONGDOUBLE_SP_MULMOD
4654
142150 // DIRT: These are copied from the lip.cpp file
143151
144152 inline long& _ntl_ALLOC(_ntl_gbigint p)
145 { return (((long *) p)[0]); }
153 { return p->alloc_; }
146154
147155 inline long& _ntl_SIZE(_ntl_gbigint p)
148 { return (((long *) p)[1]); }
156 { return p->size_; }
149157
150158 inline long _ntl_ZEROP(_ntl_gbigint p)
151159 {
166174
167175 void _ntl_gsadd(_ntl_gbigint a, long d, _ntl_gbigint *b);
168176 /* *b = a + d */
177
178 void _ntl_gssub(_ntl_gbigint a, long d, _ntl_gbigint *b);
179 /* *b = a - d */
169180
170181 void _ntl_gadd(_ntl_gbigint a, _ntl_gbigint b, _ntl_gbigint *c);
171182 /* *c = a + b */
665676 void
666677 _ntl_quick_accum_end(_ntl_gbigint x);
667678
668 #endif
679 // special-purpose routines for SSMul in ZZX
680
681 #if (defined(NTL_GMP_LIP) && (NTL_ZZ_NBITS & (NTL_ZZ_NBITS-1)) == 0)
682 // NOTE: the test (NTL_ZZ_NBITS & (NTL_ZZ_NBITS-1)) == 0
683 // effectively checks that NTL_ZZ_NBITS is a power of two
684
685 #define NTL_PROVIDES_SS_LIP_IMPL
686
687 void
688 _ntl_leftrotate(_ntl_gbigint *a, const _ntl_gbigint *b, long e,
689 _ntl_gbigint p, long n, _ntl_gbigint *scratch);
690
691 void
692 _ntl_ss_addmod(_ntl_gbigint *x, const _ntl_gbigint *a,
693 const _ntl_gbigint *b, _ntl_gbigint p, long n);
694 void
695 _ntl_ss_submod(_ntl_gbigint *x, const _ntl_gbigint *a,
696 const _ntl_gbigint *b, _ntl_gbigint p, long n);
697 #endif
698
699
700 #endif
202202
203203 static long storage() { return sizeof(long); }
204204
205 static bool IsFFTPrime() { return zz_pInfo->p_info != 0; }
206
205207 zz_p(long a, INIT_LOOP_HOLE_TYPE) { _zz_p__rep = a; }
206208
207209 // for consistency
571571 public:
572572 long k; // a 2^k point representation
573573 long MaxK; // maximum space allocated
574 long len; // length of truncated FFT
574575 long NumPrimes;
575576 UniqueArray<long> tbl[4];
576577
577 fftRep() : k(-1), MaxK(-1), NumPrimes(0) { }
578
579 fftRep(const fftRep& R) : k(-1), MaxK(-1), NumPrimes(0)
578 fftRep() : k(-1), MaxK(-1), len(0), NumPrimes(0) { }
579
580 fftRep(const fftRep& R) : k(-1), MaxK(-1), len(0), NumPrimes(0)
580581 { *this = R; }
581582
582 fftRep(INIT_SIZE_TYPE, long InitK) : k(-1), MaxK(-1), NumPrimes(0)
583 fftRep(INIT_SIZE_TYPE, long InitK) : k(-1), MaxK(-1), len(0), NumPrimes(0)
583584 { SetSize(InitK); }
584585
585586 fftRep& operator=(const fftRep&);
588589 };
589590
590591
591 void TofftRep(fftRep& y, const zz_pX& x, long k, long lo, long hi);
592
593
594 void TofftRep_trunc(fftRep& y, const zz_pX& x, long k, long len,
595 long lo, long hi);
596
597 inline void TofftRep_trunc(fftRep& y, const zz_pX& x, long k, long len)
598 { TofftRep_trunc(y, x, k, len, 0, deg(x)); }
599
600 inline
601 void TofftRep(fftRep& y, const zz_pX& x, long k, long lo, long hi)
592602 // computes an n = 2^k point convolution of x[lo..hi].
603 { TofftRep_trunc(y, x, k, 0, lo, hi); }
593604
594605 inline void TofftRep(fftRep& y, const zz_pX& x, long k)
595606
0
1 #ifndef NTL_pd_FFT__H
2 #define NTL_pd_FFT__H
3
4
5 #include <NTL/tools.h>
6
7 NTL_OPEN_NNS
8
9
10 // Sets control register so that rounding mode
11 // is "down". Destructor restores control regsiter.
12 struct CSRPush {
13 unsigned int reg;
14 CSRPush();
15 ~CSRPush();
16 };
17
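// [Editor's aside] Usage is plain RAII (a hedged illustration, not code from
// the NTL sources): construct a CSRPush in the scope that needs round-down
// mode, and the saved control register is restored when the scope exits.
//
//    {
//       CSRPush csr_guard;   // rounding mode is now "down"
//       // ... run the AVX FFT kernels that depend on this mode ...
//    }                       // destructor restores the saved register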
18
19 struct pd_mod_t {
20 double q;
21 const double **wtab;
22 const double **wqinvtab;
23 const double **wtab1;
24 const double **wqinvtab1;
25 };
26
27
28 void
29 pd_LazyPrepMulModPrecon_impl(double *bninv, const double *b, double n, long len);
30
31 void
32 pd_fft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
33 long yn, long xn);
34
35 void
36 pd_fft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
37 long yn, long xn, double fac);
38
39 void
40 pd_ifft_trunc_impl(long* A, const long* a, double* xp, long lgN,
41 const pd_mod_t& mod, long yn);
42
43 void
44 pd_ifft_trunc_impl(long* A, const long* a, double* xp, long lgN,
45 const pd_mod_t& mod, long yn, double fac);
46
47 NTL_CLOSE_NNS
48
49 #endif
8484
8585
8686
87 void quad_float_normalize(quad_float& z, const double& xhi, const double& xlo);
88
89 void quad_float_in_place_add(quad_float& x, const quad_float& y);
90 void quad_float_in_place_sub(quad_float& x, const quad_float& y);
91 void quad_float_in_place_mul(quad_float& x, const quad_float& y);
92 void quad_float_in_place_div(quad_float& x, const quad_float& y);
93
94 void quad_float_in_place_negate(quad_float& x);
95 void quad_float_in_place_sqrt(quad_float& y, double& c_ref);
96
97 void quad_float_PrecisionOK(long&, const double&);
98
99
100
101
87102 #if (NTL_BITS_PER_INT < NTL_DOUBLE_PRECISION)
88103
89104 inline quad_float to_quad_float(int n) { return quad_float(n, 0); }
101116
102117
103118
104 inline quad_float to_quad_float(double x) { return quad_float(TrueDouble(x), 0); }
119
120 // NOTE: on extended precision platforms, the call to TrueDouble
121 // should strip the extra precision
122 inline quad_float to_quad_float(double x)
123 { return quad_float(TrueDouble(x), 0); }
105124
106125 inline quad_float to_quad_float(float x)
107126 { return to_quad_float(double(x)); }
109128 inline quad_float& quad_float::operator=(double x)
110129 { *this = to_quad_float(x); return *this; }
111130
112 quad_float operator+(const quad_float&, const quad_float& );
131
132
133
134
135 inline quad_float& operator+= (quad_float& x, const quad_float& y)
136 { quad_float_in_place_add(x, y); return x; }
137 inline quad_float& operator-= (quad_float& x, const quad_float& y)
138 { quad_float_in_place_sub(x, y); return x; }
139 inline quad_float& operator*= (quad_float& x, const quad_float& y)
140 { quad_float_in_place_mul(x, y); return x; }
141 inline quad_float& operator/= (quad_float& x, const quad_float& y)
142 { quad_float_in_place_div(x, y); return x; }
143
144 inline quad_float operator-(const quad_float& x)
145 { quad_float xx = x; quad_float_in_place_negate(xx); return xx; }
146
147
148
149 inline quad_float operator+(const quad_float& x, const quad_float& y)
150 { quad_float xx = x; xx += y; return xx; }
151
152 inline quad_float operator-(const quad_float& x, const quad_float& y)
153 { quad_float xx = x; xx -= y; return xx; }
154
155 inline quad_float operator*(const quad_float& x, const quad_float& y)
156 { quad_float xx = x; xx *= y; return xx; }
157
158 inline quad_float operator/(const quad_float& x, const quad_float& y)
159 { quad_float xx = x; xx /= y; return xx; }
160
161
162
163
164
165
113166
114167 inline quad_float operator+(const quad_float& x, double y )
115168 { return x + to_quad_float(y); }
117170 inline quad_float operator+(double x, const quad_float& y)
118171 { return to_quad_float(x) + y; }
119172
120 quad_float operator-(const quad_float&, const quad_float& );
121
122173 inline quad_float operator-(const quad_float& x, double y )
123174 { return x - to_quad_float(y); }
124175
125176 inline quad_float operator-(double x, const quad_float& y)
126177 { return to_quad_float(x) - y; }
127178
128 quad_float operator*(const quad_float&, const quad_float& );
129
130179 inline quad_float operator*(const quad_float& x, double y )
131180 { return x * to_quad_float(y); }
132181
133182 inline quad_float operator*(double x, const quad_float& y)
134183 { return to_quad_float(x) * y; }
135184
136 quad_float operator/(const quad_float&, const quad_float& );
137
138185 inline quad_float operator/(const quad_float& x, double y )
139186 { return x / to_quad_float(y); }
140187
141188 inline quad_float operator/(double x, const quad_float& y)
142189 { return to_quad_float(x) / y; }
143190
144 quad_float operator-(const quad_float& x);
145
146 quad_float& operator+= (quad_float& x, const quad_float& y);
191
192
147193 inline quad_float& operator += (quad_float& x, double y)
148194 { x += to_quad_float(y); return x; }
149195
150 quad_float& operator-= (quad_float& x, const quad_float& y);
151196 inline quad_float& operator-= (quad_float& x, double y)
152197 { x -= to_quad_float(y); return x; }
153198
154 quad_float& operator*= (quad_float& x, const quad_float& y);
155199 inline quad_float& operator*= (quad_float& x, double y)
156200 { x *= to_quad_float(y); return x; }
157201
158 quad_float& operator/= (quad_float& x, const quad_float& y);
159202 inline quad_float& operator/= (quad_float& x, double y)
160203 { x /= to_quad_float(y); return x; }
161204
291334
292335 long IsFinite(quad_float *x);
293336
294 long PrecisionOK();
295337
296338 quad_float ldexp(const quad_float& x, long exp);
297339
432432 // on relatively modern versions of gcc, we can
433433 // declare "restricted" pointers in C++
434434
435 // we also can use __attribute__((always_inline))
436
437
435438 #define NTL_RESTRICT __restrict
439 #define NTL_ALWAYS_INLINE __attribute__((always_inline))
436440
437441 #else
438442
439443 #define NTL_RESTRICT
440
441 #endif
444 #define NTL_ALWAYS_INLINE
445
446 #endif
447
448
449
450
442451
443452 // A very lightly wrapped pointer that does nothing more than provide
444453 // auto cleanup in a destructor. Use the UniquePtr class (in SmartPtr.h)
808817 );
809818 }
810819
820 inline void
821 ll_add(ll_type& x, const ll_type& a)
822 {
823 __asm__ (
824 "addq %[alo],%[xlo] \n\t"
825 "adcq %[ahi],%[xhi]" :
826 [xhi] "+r" (x.hi), [xlo] "+r" (x.lo) :
827 [ahi] "rm" (a.hi), [alo] "rm" (a.lo) :
828 "cc"
829 );
830 }
831
832
811833
812834
813835 // NOTE: an optimizing compiler will remove the conditional.
923945 x += a;
924946 }
925947
948 inline void
949 ll_add(ll_type& x, const ll_type& a)
950 {
951 x += a;
952 }
953
954
926955 // NOTE: shamt must be in the range 0..NTL_BITS_PER_LONG-1
927956 template<long shamt>
928957 unsigned long
9771006 #define NTL_DECLARE_RELOCATABLE_WHEN(x) \
9781007 constexpr bool DeclareRelocatableType x
9791008
980 #if (defined(NTL_HAVE_COPY_TRAITS1) || defined(_MSC_VER))
1009 #if (defined(NTL_HAVE_COPY_TRAITS1) || defined(NTL_WINPACK))
9811010
9821011
9831012 // This strategy is used on compilers that fully support C++11 type traits.
9871016 // Just to be on the safe side, I check for a trivial destructor.
9881017
9891018 // This strategy is checked in the CheckCOPY_TRAITS1.cpp program.
1019
1020 // We also use this strategy for the WINPACK distribution.
1021 // It should work on Windows with any compiler that properly supports C++11
9901022
9911023
9921024 template<class T>
3939 #define NTL_VectorMinAlloc (4)
4040 #endif
4141
42 // vectors are always expanded by at least this ratio
43
44 #ifndef NTL_VectorExpansionRatio
45 #define NTL_VectorExpansionRatio (1.4)
46 #endif
47
4842 // controls initialization during input
4943
5044 #ifndef NTL_VectorInputBlock
112106 guard.relax();
113107 }
114108
115 template<class T>
116 void BlockMoveConstructFromVec(T* p, long n, const T* q)
117 {
118 long i;
119
120 NTL_SCOPE(guard) { default_BlockDestroy(p, i); };
121
122 for (i = 0; i < n; i++)
123 (void) new(&p[i]) T(q[i]);
124
125 guard.relax();
126 }
127
128109
129110 template<class T>
130111 void BlockConstructFromVec(T* p, long n, const T* q) { default_BlockConstructFromVec(p, n, q); }
131
132112
133113
134114
681661 NTL_VEC_HEAD(_vec__rep)->fixed = 0;
682662 }
683663 else if (n > NTL_VEC_HEAD(_vec__rep)->alloc) {
684 m = max(n, long(NTL_VectorExpansionRatio*NTL_VEC_HEAD(_vec__rep)->alloc));
664 m = max(n, _ntl_vec_grow(NTL_VEC_HEAD(_vec__rep)->alloc));
685665 m = ((m+NTL_VectorMinAlloc-1)/NTL_VectorMinAlloc) * NTL_VectorMinAlloc;
686666
687667 ReAllocate(m, VecStrategy<NTL_RELOC_TAG>());
11 #ifndef NTL_version__H
22 #define NTL_version__H
33
4 #define NTL_VERSION "11.0.0"
4 #define NTL_VERSION "11.3.0"
55
66 #define NTL_MAJOR_VERSION (11)
7 #define NTL_MINOR_VERSION (0)
7 #define NTL_MINOR_VERSION (3)
88 #define NTL_REVISION (0)
99
1010 #endif
0
1 #include <NTL/ctools.h>
2
3 #include <cstdlib>
4 #include <immintrin.h>
5 #include <iostream>
6
7 // This actually checks for AVX512F+DQ+VL
8
9
10 #if (!defined(__GNUC__) || !defined(__x86_64__) || !defined(__AVX512F__))
11 #error "AVX512F not supported"
12 #endif
13
14 #if (!defined(__AVX512VL__) || !defined(__AVX512DQ__))
15 #error "AVX512F not supported"
16 #endif
17
18 #if (NTL_BITS_PER_LONG != 64 || NTL_BITS_PER_INT != 32 || NTL_DOUBLE_PRECISION != 53)
19 #error "AVX512F not supported"
20 // sanity check -- code that uses this feature also relies on this
21 #endif
22
23 #ifndef NTL_HAVE_ALIGNED_ARRAY
24 #error "AVX512F not supported"
25 #endif
26
27 using namespace std;
28
29
30 void fun(double * x, const double *a, const double *b)
31 {
32 __m512d xvec, avec, bvec, cvec;
33
34 avec = _mm512_load_pd(a);
35 bvec = _mm512_load_pd(b);
36 xvec = _mm512_load_pd(x);
37
38 xvec = _mm512_fmadd_pd(avec, bvec, xvec);
39
40 _mm512_store_pd(x, xvec);
41 }
42
43 void fun1(double *x, const long *p)
44 {
45 __m256i a = _mm256_load_si256((const __m256i*)p);
46 _mm256_store_pd(x, _mm256_cvtepi64_pd(a));
47 }
48
49
50 int main()
51 {
52 NTL_AVX512_LOCAL_ARRAY(vp, double, 24);
53
54 double *a = vp + 0*8;
55 double *b = vp + 1*8;
56 double *x = vp + 2*8;
57
58 a[0] = atoi("1");
59 a[1] = atoi("2");
60 a[2] = atoi("3");
61 a[3] = atoi("4");
62 a[4] = atoi("5");
63 a[5] = atoi("6");
64 a[6] = atoi("7");
65 a[7] = atoi("8");
66
67 b[0] = atoi("2");
68 b[1] = atoi("3");
69 b[2] = atoi("4");
70 b[3] = atoi("5");
71 b[4] = atoi("6");
72 b[5] = atoi("7");
73 b[6] = atoi("8");
74 b[7] = atoi("9");
75
76 x[0] = atoi("3");
77 x[1] = atoi("4");
78 x[2] = atoi("5");
79 x[3] = atoi("6");
80 x[4] = atoi("7");
81 x[5] = atoi("8");
82 x[6] = atoi("9");
83 x[7] = atoi("10");
84
85 fun(x, a, b);
86
87 NTL_AVX_LOCAL_ARRAY(lp, long, 4);
88 NTL_AVX_LOCAL_ARRAY(dp, double, 4);
89
90 lp[0] = atoi("1");
91 lp[1] = atoi("2");
92 lp[2] = atoi("3");
93 lp[3] = atoi("4");
94
95 fun1(dp, lp);
96
97 if (x[0] == 5 && x[1] == 10 && x[2] == 17 && x[3] == 26 &&
98 x[4] == 37 && x[5] == 50 && x[6] == 65 && x[7] == 82 &&
99 dp[0] == 1 && dp[1] == 2 && dp[2] == 3 && dp[3] == 4)
100 return 0;
101 else
102 return -1;
103 }
104
105
106
3131
3232 #if (defined(NTL_THREADS) && defined(NTL_TLS_HACK))
3333
34 #warning "TLS_HACK=on"
3534
3635 namespace details_pthread {
3736
0 ntl-11.0.0
0 ntl-11.3.0
9797 cout << "NTL_SAFE_VECTORS\n";
9898 #endif
9999
100 #ifdef NTL_ENABLE_AVX_FFT
101 cout << "NTL_ENABLE_AVX_FFT\n";
102 #endif
103
104 #ifdef NTL_AVOID_AVX512
105 cout << "NTL_AVOID_AVX512\n";
106 #endif
107
100108 #ifdef NTL_RANGE_CHECK
101109 cout << "NTL_RANGE_CHECK\n";
102110 #endif
7777 'NTL_CLEAN_PTR' => 'on',
7878 'NTL_SAFE_VECTORS' => 'on',
7979 'NTL_RANGE_CHECK' => 'off',
80 'NTL_ENABLE_AVX_FFT' => 'off',
81 'NTL_AVOID_AVX512' => 'off',
8082
8183
8284 'NTL_SPMM_ULL' => 'off',
620622 ($config_info =~ /\((.*?),(.*?),(.*?)\)/) or die "Error: GenConfigInfo failed";
621623
622624 # convert to number
623 $language_standard += 0 or die "Error: GenConfigInfo failed";
625 $language_standard += 0 or Warning("__cplusplus not correctly defined");
624626
625627 print("compiler_name=$compiler_name\n");
626628 print("language_standard=$language_standard\n");
+2479
-1568
src/FFT.cpp
00
11 #include <NTL/FFT.h>
2 #include <NTL/FFT_impl.h>
3
4 #ifdef NTL_ENABLE_AVX_FFT
5 #include <NTL/SmartPtr.h>
6 #include <NTL/pd_FFT.h>
7 #endif
28
39
410 /********************************************************************
511
612 This is an implementation of a "small prime" FFT, which lies at the heart of
7 the ZZ_pX arithmetic, as well as some other applications.
8
9 The basic algorithm is loosely based on the routine in the Cormen, Leiserson,
10 Rivest, and Stein book on algorithms.
11
12
13 CACHE PERFORMANCE
14
15 Some attention has been paid to cache performance, but there is still more that
16 could be done.
17
18
19 The bit-reverse-copy (BRC) algorithm is a simple table-driven algorithm up to
20 a certain threshold, and then switches to the COBRA algorithm from Carter and
21 Gatlin, "Towards an optimal bit-reversal permutation algorithm", FOCS 1998.
22 I've found that COBRA helps, but not much: just 5-10%. I've also found that
23 getting rid of BRC altogether leads to another 5-10% improvement. These
24 numbers are based on experiments with 2^{17}- and 2^{19}-point FFTs, looping
25 over 50 different primes on a Core 2 Duo machine.
26
27 One could get rid of bit-reverse-copy altogether. The current FFT routines all
28 implement what is called Decimation-In-Time (DIT), which means that inputs are
29 bit reversed. One can also implement the FFT using Decimation-In-Frequency
30 (DIF), which means that the outputs are bit reversed. One can get rid of the
31 bit reversals for doing convolutions by simply doing the forward FFT using
32 DIF-FFT and the reverse FFT using DIT-FFT. This would allow one to simply
33 eliminate all of the bit-reversal steps, which would lead to some nontrivial
34 savings. However, there are a few places in NTL where I rely on the ordering
35 of elements within an FFTRep to be their "natural ordering". The reduce and
36 AddExpand routines in ZZ_pX come to mind (which actually may become simpler),
37 along with RevToFFTRep and RevFromFFTRep (which may be trickier). Anyway,
38 because BRC doesn't seem to be a big problem right now, it doesn't seem worth
39 worrying about this.
40
41
42 Within the FFT algorithm itself, I have not tried anything like Bailey's 4-step
43 algorithm. Maybe this should be tested. However, I somehow doubt that
44 anything more than modest gains will be achieved, since most modern processors
45 now employ a kind of memory prefetching technique, to keep the cache filled
46 with memory locations that are likely to be used next. Moreover, the FFT
47 algorithm used here accesses memory for the most part in small, sequential
48 strides, which meshes well with hardware prefetching. The paper "Algorithms to
49 Take Advantage of Hardware Prefetching" [Pan, Cherng, Dick, Ladner, Workshop on
50 Algorithm Engineering and Experiments, 2007] contains some interesting
51 experiments and useful background information. Anyway, there is still room for
52 more experimentation.
53
13 ZZ_pX and zz_pX arithmetic, and impacts many other applications as well
14 (such as arithmetic in ZZ_pEX, zz_pEX, and ZZX).
15
16 The algorithm is a Truncated FFT based on code originally developed by David
17 Harvey. David's code built on the single-precision modular multiplication
18 technique introduced in NTL many years ago, but also uses a "lazy
19 multiplication" technique, which reduces the number of "correction" steps that
20 need to be performed in each butterfly (see below for more details). It also
21 implements a version of the Truncated FFT algorithm introduced by Joris van der
22 Hoeven at ISSAC 2004. Also see "A cache-friendly truncated FFT", David Harvey,
23 Theoretical Computer Science Volume 410, Issues 27-29, 28 June 2009, Pages
24 2649-2658.
25
26 I have almost completely re-written David's original code to make it fit into
27 NTL's software framework; however, all of the key logic is still based on
28 David's code. David's original code also implemented a 2D transformation which
29 is more cache-friendly for *very* large transforms. However, my experiments
30 indicated this was only beneficial for transforms of size at least 2^20, and so
31 I did not incorporate this variant.
32
33 Here is the Copyright notice from David's original code:
34
35
36 ==============================================================================
37
38 fft62: a library for number-theoretic transforms
39
40 Copyright (C) 2013, David Harvey
41
42 All rights reserved.
43
44 Redistribution and use in source and binary forms, with or without
45 modification, are permitted provided that the following conditions are met:
46
47 * Redistributions of source code must retain the above copyright notice, this
48 list of conditions and the following disclaimer.
49 * Redistributions in binary form must reproduce the above copyright notice,
50 this list of conditions and the following disclaimer in the documentation
51 and/or other materials provided with the distribution.
52
53 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
54 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
56 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
57 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
59 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
60 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
61 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
62 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
63
64
65 ==============================================================================
5466
5567
5668 SINGLE-PRECISION MODULAR ARITHMETIC
8193 necessary. To be more portable, some of these computations should really be
8294 done using unsigned arithmetic, but that is not so important here. Also, the
8395 adjustment steps can be replaced by simple non-branching instruction sequences
84 involving SHIFT, AND, and ADD/SUB instructions. On many modern machines, this
85 is usually faster and NTL uses this non-branching strategy.
96 involving SHIFT, AND, and ADD/SUB instructions. On some modern machines, this
97 is usually faster and NTL uses this non-branching strategy. However, on other
98 machines (modern x86's are an example of this), conditional move instructions
99 can be used in place of branching, and this code can be faster than the
100 non-branching code. NTL's performance-tuning script will figure out the best
101 way to do this.
102
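As a point of reference, here is a minimal scalar sketch of the SHIFT/AND/ADD-SUB
idea (an editor's illustration, not NTL's actual sp_Correct* code). It assumes a
64-bit long and relies on arithmetic right shift of a negative value, which is
implementation-defined in C++ but behaves as expected on the platforms NTL targets.
The conditional-move flavor is just the ternary form, which compilers typically
lower to a cmov on x86:

   // reduce r from [0, 2n) to [0, n) without branching
   inline long correct_excess_shift_sketch(long r, long n)
   {
      r = r - n;                      // now in [-n, n)
      return r + ((r >> 63) & n);     // add n back only if r went negative
   }

   // same reduction, written so the compiler can emit a conditional move
   inline long correct_excess_cmov_sketch(long r, long n)
   {
      long s = r - n;
      return (s < 0) ? r : s;
   }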
86103
87104 Other simple optimizations can be done, such as precomputing 1/double(n) when n
88105 remains fixed for many computations, as is often the case.
110127 details about it, but since then, it has come to be known as "Shoup
111128 multiplication" in a few papers, so I'll accept that. :-) The paper "Faster
112129 arithmetic for number-theoretic transforms" [David Harvey, J. Symb. Comp. 60
113 (2014) 113–119] seems to be the first place where it is discussed in detail,
130 (2014)] seems to be the first place where it is discussed in detail,
114131 and Harvey's paper also contains some improvements which I discuss below.
115132
116133 The basic idea is that in many computations, not only n, but one of the
184201 not really be necessary (assuming that computing both high and low words of a
185202 double-word product is no more expensive than just computing the low word).
186203 However, none of the compilers I've used have been able to perform that
187 optimization.
204 optimization (in NTL v11.1, I added code that hand-codes this optimization).
188205
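To make the preconditioning idea concrete, here is a rough scalar sketch written by
the editor (it is not NTL's MulModPrecon, whose interface and corner cases differ).
It assumes 64-bit words, a modulus n below 2^63, a fixed multiplier b in [0, n), and
the unsigned __int128 extension provided by gcc/clang:

   #include <cstdint>

   typedef unsigned __int128 u128;

   // precompute bninv = floor(b * 2^64 / n) for the fixed multiplier b
   inline std::uint64_t prep_mulmod_precon_sketch(std::uint64_t b, std::uint64_t n)
   {
      return (std::uint64_t)(((u128) b << 64) / n);
   }

   // return a*b mod n; the quotient estimate q is off by at most one,
   // so a single correction step suffices
   inline std::uint64_t mulmod_precon_sketch(std::uint64_t a, std::uint64_t b,
                                             std::uint64_t n, std::uint64_t bninv)
   {
      std::uint64_t q = (std::uint64_t)(((u128) a * bninv) >> 64);
      std::uint64_t r = a*b - q*n;     // computed mod 2^64; true value lies in [0, 2n)
      return (r >= n) ? r - n : r;
   }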
189206
190207 64-BIT MACHINES
191208
192 FIXME: this discussion is out of date.
193209 Current versions of NTL use (by default) 60-bit moduli based
194210 on all-integer arithmetic.
195211
196212
197213 Prior to v9.0 of NTL, on 64 bits, the modulus n was restricted to 50 bits, in
198214 order to allow the use of double-precision techniques, as double's have 53 bits
199 of precision. However, since the x86-64 is such an important target, and
200 one can still access the old x87 FPU, which provided 64-bit precision, the
201 bound on n on such platforms is now 60 bits. Actually, 62 bits could be
202 supported, but other things (namely, the TBL_REM implementation in
203 lip.cpp) start to slow down if 62 bits are used, so 60 seems like a good
204 compromise. Currently, 60-bit moduli are available only when using gcc on
205 x86-64 machines, and when compiling NTL with GMP.
206
207 Now, the FPU-based multiplies are in fact a bit slower than the SSE-based
208 multiplies. However, with the preconditioned all-integer MulMod's now used
209 extensively on almost all critical paths within NTL, this does not really
210 matter, and in fact, many things get faster with the wider moduli, so overall,
211 it is a net performance gain.
215 of precision. However, NTL now supports 60-bit moduli. Actually, 62 bits can
216 be supported by setting the NTL_MAXIMIZE_SP_NBITS configuration flag, but other
217 things (namely, the TBL_REM implementation in lip.cpp) start to slow down if 62
218 bits are used, so 60 seems like a good compromise. Currently, 60-bit moduli
219 are available only when compiling NTL with GMP, and when some kind of extended
220 integer or floating-point arithmetic is available.
212221
213222
214223 FUTURE TRENDS
215224
216 In the future, I might also experiment with other MulMod techniques, such as
217 those described in "Improved Division by Invariant Integers" [Moeller,
218 Granlund, IEEE Transactions on Computers, June 2010]. This might allow for,
219 say, 60-bit moduli on 64-bit machines that don't have extended double
220 precision. It is not clear how the performance of this would compare with the
221 floating-point methods; however, it probably doesn't matter too much, as the
222 preconditioned MulMod's are the most important ones.
223
224 It might also be useful to go back and reconsider Montgomery multiplication, at
225 least for "internal" use, like the FFT. However, I doubt that this will help
226 significantly.
227
228 As mentioned above, it could be useful to experiment with more cache-friendly
229 variants of the FFT, like Bailey's 4-step method. I could also experiment with
230 using the DIF/DIT. This affects some code outside of FFT as well (in ZZ_pX and
231 zz_pX, like reduce and AddExpand), but should not affect any documented
232 interfaces.
233
234 Another direction to consider is exploiting concurrency. Besides using
235 multiple cores to parallelize things at a higher level, it would be nice to
236 exploit newer SIMD instructions. Unfortunately, as of now (early 2015), these
237 don't seem to have the functionality I need. A 64-bit x 64-bit -> low order
238 64-bit instruction is supposed to be available soon in the new AVX-512
239 instruction set. That would be a good start, but I would really like to get
240 the high-order 64-bits too. Maybe that will come someday. In the mean time, it
241 might be fun to experiment with using the AVX-512 instructions that will be
242 available, which will at least allow a floating-point-based
243 implementation, or an all-integer implementation with emulated MulHi. I have
244 no idea how performance will compare.
245
225
226 * The following papers
227
228 https://eprint.iacr.org/2017/727
229 https://eprint.iacr.org/2016/504
230 https://eprint.iacr.org/2015/382
231
232 present FFTs that access the pre-computed tables in a somewhat more efficient
233 fashion, so that we only need to read from the tables O(n) times, rather than
234 O(n log n) times.
235
236 I've partially implemented this, and have gotten mixed results.
237 For smallish FFT's (below k=10 or 11), this code is somewhat slower.
238 For larger FFT's (say, k=17), I see a speedup of 3-10%.
246239
247240
248241 ********************************************************************/
249242
250243
251244
252
253
254
255
256 // #define NTL_BRC_TEST
257 // Flag to test the cost of "bit reverse copy"
258
259
260 #define NTL_FFT_BIGTAB_LIMIT (200)
261 #ifndef NTL_BRC_TEST
245 #define NTL_FFT_BIGTAB_LIMIT (180)
262246 #define NTL_FFT_BIGTAB_MAXROOT (17)
263 #else
264 #define NTL_FFT_BIGTAB_MAXROOT (25)
265 #endif
266 // big tables are only used for the first NTL_FFT_BIGTAB_LIMIT primes,
267 // and then only for k-values at most NTL_FFT_BIGTAB_MAXROOT
247 #define NTL_FFT_BIGTAB_MINROOT (7)
248
249 // table sizes are bounded by 2^bound, where
250 // bound = NTL_FFT_BIGTAB_MAXROOT-index/NTL_FFT_BIGTAB_LIMIT.
251 // Here, index is the index of an FFT prime, or 0 for a user FFT prime.
252 // If bound <= NTL_FFT_BIGTAB_MINROOT, then big tables are not used,
253 // so only the first
254 // (NTL_FFT_BIGTAB_MAXROOT-NTL_FFT_BIGTAB_MINROOT)*NTL_FFT_BIGTAB_LIMIT
255 // FFT primes will have big tables.
268256
269257 // NOTE: in newer versions of NTL (v9.1 and later), the BIGTAB
270258 // code is only about 5-15% faster than the non-BIGTAB code, so
271259 // this is not a great time/space trade-off.
260 // However, some further optimizations may only be implemented
261 // if big tables are used.
272262
273263 // NOTE: NTL_FFT_BIGTAB_MAXROOT is set independently of the parameter
274264 // NTL_FFTMaxRoot defined in FFT.h (and which is typically 25).
279269
280270
281271 NTL_START_IMPL
272
273
274
275 class FFTVectorPair {
276 public:
277 Vec<long> wtab_precomp;
278 Vec<mulmod_precon_t> wqinvtab_precomp;
279 };
280
281 typedef LazyTable<FFTVectorPair, NTL_FFTMaxRoot+1> FFTMultipliers;
282
283
284 #ifdef NTL_ENABLE_AVX_FFT
285 class pd_FFTVectorPair {
286 public:
287 AlignedArray<double> wtab_precomp;
288 AlignedArray<double> wqinvtab_precomp;
289 };
290
291 typedef LazyTable<pd_FFTVectorPair, NTL_FFTMaxRoot+1> pd_FFTMultipliers;
292 #endif
293
294
295
296 class FFTMulTabs {
297 public:
298
299 #ifndef NTL_ENABLE_AVX_FFT
300 long bound;
301 FFTMultipliers MulTab;
302 #else
303 pd_FFTMultipliers pd_MulTab[2];
304 #endif
305
306 };
307
308 void FFTMulTabsDeleterPolicy::deleter(FFTMulTabs *p) { delete p; }
309
282310
283311
284312 FFTTablesType FFTTables;
428456 }
429457
430458
431 void InitFFTPrimeInfo(FFTPrimeInfo& info, long q, long w, bool bigtab)
432 {
433 mulmod_t qinv = PrepMulMod(q);
434
435 long mr = CalcMaxRoot(q);
436
437 info.q = q;
438 info.qinv = qinv;
439 info.qrecip = 1/double(q);
440 info.zz_p_context = 0;
441
442
443 info.RootTable[0].SetLength(mr+1);
444 info.RootTable[1].SetLength(mr+1);
445 info.TwoInvTable.SetLength(mr+1);
446 info.TwoInvPreconTable.SetLength(mr+1);
447
448 long *rt = &info.RootTable[0][0];
449 long *rit = &info.RootTable[1][0];
450 long *tit = &info.TwoInvTable[0];
451 mulmod_precon_t *tipt = &info.TwoInvPreconTable[0];
452
453 long j;
454 long t;
455
456 rt[mr] = w;
457 for (j = mr-1; j >= 0; j--)
458 rt[j] = MulMod(rt[j+1], rt[j+1], q);
459
460 rit[mr] = InvMod(w, q);
461 for (j = mr-1; j >= 0; j--)
462 rit[j] = MulMod(rit[j+1], rit[j+1], q);
463
464 t = InvMod(2, q);
465 tit[0] = 1;
466 for (j = 1; j <= mr; j++)
467 tit[j] = MulMod(tit[j-1], t, q);
468
469 for (j = 0; j <= mr; j++)
470 tipt[j] = PrepMulModPrecon(tit[j], q, qinv);
471
472 if (bigtab)
473 info.bigtab.make();
474 }
475459
476460
477461 #ifndef NTL_WIZARD_HACK
504488 long q, w;
505489 NextFFTPrime(q, w, i);
506490
507 bool bigtab = false;
491 long bigtab_index = -1;
508492
509493 #ifdef NTL_FFT_BIGTAB
510 if (i < NTL_FFT_BIGTAB_LIMIT)
511 bigtab = true;
494 bigtab_index = i;
512495 #endif
513496
514 InitFFTPrimeInfo(*info, q, w, bigtab);
497 InitFFTPrimeInfo(*info, q, w, bigtab_index);
515498 info->zz_p_context = Build_zz_pInfo(info.get());
516499 bld.move(info);
517500 }
518501
519502 } while (0);
520503 }
521
522
523
524
525
526 #define NTL_PIPELINE
527 // Define to get some software pipelining...actually seems
528 // to help somewhat
529
530 #define NTL_LOOP_UNROLL
531 // Define to unroll some loops. Seems to help a little
532
533 // FIXME: maybe the above two should be tested by the wizard
534
535
536 static
537 long RevInc(long a, long k)
538 {
539 long j, m;
540
541 j = k;
542 m = 1L << (k-1);
543
544 while (j && (m & a)) {
545 a ^= m;
546 m >>= 1;
547 j--;
548 }
549 if (j) a ^= m;
550 return a;
551 }
552
553
554 // FIXME: This could potentially be shared across threads, using
555 // a "lazy table".
556 static inline
557 Vec<long> *get_brc_mem()
558 {
559 NTL_TLS_LOCAL_INIT(Vec< Vec<long> >, brc_mem_vec, (INIT_SIZE, NTL_FFTMaxRoot+1));
560 return brc_mem_vec.elts();
561 }
562
563
564
565 #if 0
566
567
568 static
569 void BitReverseCopy(long * NTL_RESTRICT A, const long * NTL_RESTRICT a, long k)
570 {
571 Vec<long> *brc_mem = get_brc_mem();
572
573 long n = 1L << k;
574 long* NTL_RESTRICT rev;
575 long i, j;
576
577 rev = brc_mem[k].elts();
578 if (!rev) {
579 brc_mem[k].SetLength(n);
580 rev = brc_mem[k].elts();
581 for (i = 0, j = 0; i < n; i++, j = RevInc(j, k))
582 rev[i] = j;
583 }
584
585 for (i = 0; i < n; i++)
586 A[rev[i]] = a[i];
587 }
588
589
590 static
591 void BitReverseCopy(unsigned long * NTL_RESTRICT A, const long * NTL_RESTRICT a, long k)
592 {
593 Vec<long> *brc_mem = get_brc_mem();
594
595 long n = 1L << k;
596 long* NTL_RESTRICT rev;
597 long i, j;
598
599 rev = brc_mem[k].elts();
600 if (!rev) {
601 brc_mem[k].SetLength(n);
602 rev = brc_mem[k].elts();
603 for (i = 0, j = 0; i < n; i++, j = RevInc(j, k))
604 rev[i] = j;
605 }
606
607 for (i = 0; i < n; i++)
608 A[rev[i]] = a[i];
609 }
610
611 #else
612
613
614
615 #define NTL_BRC_THRESH (11)
616 #define NTL_BRC_Q (5)
617
618 // Must have NTL_BRC_THRESH >= 2*NTL_BRC_Q
619 // Should also have (1L << (2*NTL_BRC_Q)) small enough
620 // so that we can fit that many long's into the cache
621
622
623 static
624 long *BRC_init(long k)
625 {
626 Vec<long> *brc_mem = get_brc_mem();
627
628 long n = (1L << k);
629 brc_mem[k].SetLength(n);
630 long *rev = brc_mem[k].elts();
631 long i, j;
632 for (i = 0, j = 0; i < n; i++, j = RevInc(j, k))
633 rev[i] = j;
634 return rev;
635 }
636
637
638 static
639 void BasicBitReverseCopy(long * NTL_RESTRICT B,
640 const long * NTL_RESTRICT A, long k)
641 {
642 Vec<long> *brc_mem = get_brc_mem();
643
644 long n = 1L << k;
645 long* NTL_RESTRICT rev;
646 long i, j;
647
648 rev = brc_mem[k].elts();
649 if (!rev) rev = BRC_init(k);
650
651 for (i = 0; i < n; i++)
652 B[rev[i]] = A[i];
653 }
654
655
656
657 static
658 void COBRA(long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
659 {
660 Vec<long> *brc_mem = get_brc_mem();
661
662 NTL_TLS_LOCAL(Vec<long>, BRC_temp);
663
664 long q = NTL_BRC_Q;
665 long k1 = k - 2*q;
666 long * NTL_RESTRICT rev_k1, * NTL_RESTRICT rev_q;
667 long *NTL_RESTRICT T;
668 long a, b, c, a1, b1, c1;
669 long i, j;
670
671 rev_k1 = brc_mem[k1].elts();
672 if (!rev_k1) rev_k1 = BRC_init(k1);
673
674 rev_q = brc_mem[q].elts();
675 if (!rev_q) rev_q = BRC_init(q);
676
677 T = BRC_temp.elts();
678 if (!T) {
679 BRC_temp.SetLength(1L << (2*q));
680 T = BRC_temp.elts();
681 }
682
683 for (b = 0; b < (1L << k1); b++) {
684 b1 = rev_k1[b];
685 for (a = 0; a < (1L << q); a++) {
686 a1 = rev_q[a];
687 for (c = 0; c < (1L << q); c++)
688 T[(a1 << q) + c] = A[(a << (k1+q)) + (b << q) + c];
689 }
690
691 for (c = 0; c < (1L << q); c++) {
692 c1 = rev_q[c];
693 for (a1 = 0; a1 < (1L << q); a1++)
694 B[(c1 << (k1+q)) + (b1 << q) + a1] = T[(a1 << q) + c];
695 }
696 }
697 }
698
699 static
700 void BitReverseCopy(long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
701 {
702 if (k <= NTL_BRC_THRESH)
703 BasicBitReverseCopy(B, A, k);
704 else
705 COBRA(B, A, k);
706 }
707
708
709 static
710 void BasicBitReverseCopy(unsigned long * NTL_RESTRICT B,
711 const long * NTL_RESTRICT A, long k)
712 {
713 Vec<long> *brc_mem = get_brc_mem();
714
715 long n = 1L << k;
716 long* NTL_RESTRICT rev;
717 long i, j;
718
719 rev = brc_mem[k].elts();
720 if (!rev) rev = BRC_init(k);
721
722 for (i = 0; i < n; i++)
723 B[rev[i]] = A[i];
724 }
725
726
727
728 static
729 void COBRA(unsigned long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
730 {
731 Vec<long> *brc_mem = get_brc_mem();
732
733 NTL_TLS_LOCAL(Vec<unsigned long>, BRC_temp);
734
735 long q = NTL_BRC_Q;
736 long k1 = k - 2*q;
737 long * NTL_RESTRICT rev_k1, * NTL_RESTRICT rev_q;
738 unsigned long *NTL_RESTRICT T;
739 long a, b, c, a1, b1, c1;
740 long i, j;
741
742 rev_k1 = brc_mem[k1].elts();
743 if (!rev_k1) rev_k1 = BRC_init(k1);
744
745 rev_q = brc_mem[q].elts();
746 if (!rev_q) rev_q = BRC_init(q);
747
748 T = BRC_temp.elts();
749 if (!T) {
750 BRC_temp.SetLength(1L << (2*q));
751 T = BRC_temp.elts();
752 }
753
754 for (b = 0; b < (1L << k1); b++) {
755 b1 = rev_k1[b];
756 for (a = 0; a < (1L << q); a++) {
757 a1 = rev_q[a];
758 for (c = 0; c < (1L << q); c++)
759 T[(a1 << q) + c] = A[(a << (k1+q)) + (b << q) + c];
760 }
761
762 for (c = 0; c < (1L << q); c++) {
763 c1 = rev_q[c];
764 for (a1 = 0; a1 < (1L << q); a1++)
765 B[(c1 << (k1+q)) + (b1 << q) + a1] = T[(a1 << q) + c];
766 }
767 }
768 }
769
770 static
771 void BitReverseCopy(unsigned long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
772 {
773 if (k <= NTL_BRC_THRESH)
774 BasicBitReverseCopy(B, A, k);
775 else
776 COBRA(B, A, k);
777 }
778
779
780
781
782 #endif
783504
784505
785506 #ifdef NTL_FFT_LAZYMUL
798519
799520 #endif
800521
801 #ifndef NTL_FFT_BIGTAB
802
803 #define NTL_FFT_ROUTINE_TAB FFT_aux
804 #define NTL_FFT_ROUTINE_NOTAB FFT
805
806 #else
807
808 #define NTL_FFT_ROUTINE_TAB FFT
809 #define NTL_FFT_ROUTINE_NOTAB FFT_aux
810
522
523
524
525 #ifdef NTL_FFT_LAZYMUL
526 // FFT with lazy multiplication
527
528 #ifdef NTL_CLEAN_INT
529 #define NTL_FFT_USEBUF
811530 #endif
812
813
814
815
816
817
818 #ifndef NTL_FFT_LAZYMUL
819
820
821 // A basic FFT, no tables, no lazy strategy
822
823 void NTL_FFT_ROUTINE_NOTAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
824 // performs a 2^k-point convolution modulo q
825
826 {
827 long q = info.q;
828 const long *root = info.RootTable[dir].elts();
829 mulmod_t qinv = info.qinv;
830
831 if (k <= 1) {
832 if (k == 0) {
833 A[0] = a[0];
834 return;
835 }
836 if (k == 1) {
837 long a0 = AddMod(a[0], a[1], q);
838 long a1 = SubMod(a[0], a[1], q);
839 A[0] = a0;
840 A[1] = a1;
841 return;
842 }
843 }
844
845 // assume k > 1
846
847 NTL_TLS_LOCAL(Vec<long>, wtab_store);
848 NTL_TLS_LOCAL(Vec<mulmod_precon_t>, wqinvtab_store);
849 NTL_TLS_LOCAL(Vec<long>, AA_store);
850
851 wtab_store.SetLength(1L << (k-2));
852 wqinvtab_store.SetLength(1L << (k-2));
853 AA_store.SetLength(1L << k);
854
855 long * NTL_RESTRICT wtab = wtab_store.elts();
856 mulmod_precon_t * NTL_RESTRICT wqinvtab = wqinvtab_store.elts();
857 long *AA = AA_store.elts();
858
859 wtab[0] = 1;
860 wqinvtab[0] = PrepMulModPrecon(1, q, qinv);
861
862
863 BitReverseCopy(AA, a, k);
864
865 long n = 1L << k;
866
867 long s, m, m_half, m_fourth, i, j, t, u, t1, u1, tt, tt1;
868
869 long w;
870 mulmod_precon_t wqinv;
871
872 // s = 1
873
874 for (i = 0; i < n; i += 2) {
875 t = AA[i + 1];
876 u = AA[i];
877 AA[i] = AddMod(u, t, q);
878 AA[i+1] = SubMod(u, t, q);
879 }
880
881
882
883 for (s = 2; s < k; s++) {
884 m = 1L << s;
885 m_half = 1L << (s-1);
886 m_fourth = 1L << (s-2);
887
888 w = root[s];
889 wqinv = PrepMulModPrecon(w, q, qinv);
890
891 // prepare wtab...
892
893 #if 1
894 // plain version...
895
896 for (i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
897 long w_j = wtab[j];
898 mulmod_precon_t wqi_j = wqinvtab[j];
899 long w_i = MulModPrecon(w_j, w, q, wqinv);
900 mulmod_precon_t wqi_i = PrepMulModPrecon(w_i, q, qinv);
901
902 wtab[i-1] = w_j;
903 wqinvtab[i-1] = wqi_j;
904 wtab[i] = w_i;
905 wqinvtab[i] = wqi_i;
906 }
907 #else
908 // software pipeline version...doesn't seem to make a big difference
909
910 if (s == 2) {
911 wtab[1] = MulModPrecon(wtab[0], w, q, wqinv);
912 wqinvtab[1] = PrepMulModPrecon(wtab[1], q, qinv);
913 }
914 else {
915 i = m_half-1; j = m_fourth-1;
916 wtab[i-1] = wtab[j];
917 wqinvtab[i-1] = wqinvtab[j];
918 wtab[i] = MulModPrecon(wtab[i-1], w, q, wqinv);
919
920 i -= 2; j --;
921
922 for (; i >= 0; i -= 2, j --) {
923 long wp2 = wtab[i+2];
924 long wm1 = wtab[j];
925 wqinvtab[i+2] = PrepMulModPrecon(wp2, q, qinv);
926 wtab[i-1] = wm1;
927 wqinvtab[i-1] = wqinvtab[j];
928 wtab[i] = MulModPrecon(wm1, w, q, wqinv);
929 }
930
931 wqinvtab[1] = PrepMulModPrecon(wtab[1], q, qinv);
932 }
933
934
935 #endif
936
937
938 for (i = 0; i < n; i+= m) {
939
940 long * NTL_RESTRICT AA0 = &AA[i];
941 long * NTL_RESTRICT AA1 = &AA[i + m_half];
942
943
944
945 #if 1
946 // loop unrolling and pipelining
947
948 t = AA1[0];
949 u = AA0[0];
950 t1 = MulModPrecon(AA1[1], w, q, wqinv);
951 u1 = AA0[1];
952
953
954
955 for (j = 0; j < m_half-2; j += 2) {
956 long a02 = AA0[j+2];
957 long a03 = AA0[j+3];
958 long a12 = AA1[j+2];
959 long a13 = AA1[j+3];
960 long w2 = wtab[j+2];
961 long w3 = wtab[j+3];
962 mulmod_precon_t wqi2 = wqinvtab[j+2];
963 mulmod_precon_t wqi3 = wqinvtab[j+3];
964
965 tt = MulModPrecon(a12, w2, q, wqi2);
966 long b00 = AddMod(u, t, q);
967 long b10 = SubMod(u, t, q);
968 t = tt;
969 u = a02;
970
971 tt1 = MulModPrecon(a13, w3, q, wqi3);
972 long b01 = AddMod(u1, t1, q);
973 long b11 = SubMod(u1, t1, q);
974 t1 = tt1;
975 u1 = a03;
976
977 AA0[j] = b00;
978 AA1[j] = b10;
979 AA0[j+1] = b01;
980 AA1[j+1] = b11;
981 }
982
983
984 AA0[j] = AddMod(u, t, q);
985 AA1[j] = SubMod(u, t, q);
986 AA0[j + 1] = AddMod(u1, t1, q);
987 AA1[j + 1] = SubMod(u1, t1, q);
988
989
990 #else
991 // no loop unrolling, but still some pipelining
992
993
994 t = AA1[0];
995 u = AA0[0];
996
997 for (j = 0; j < m_half-1; j++) {
998 long a02 = AA0[j+1];
999 long a12 = AA1[j+1];
1000 long w2 = wtab[j+1];
1001 mulmod_precon_t wqi2 = wqinvtab[j+1];
1002
1003 tt = MulModPrecon(a12, w2, q, wqi2);
1004 long b00 = AddMod(u, t, q);
1005 long b10 = SubMod(u, t, q);
1006 t = tt;
1007 u = a02;
1008
1009 AA0[j] = b00;
1010 AA1[j] = b10;
1011 }
1012
1013
1014 AA0[j] = AddMod(u, t, q);
1015 AA1[j] = SubMod(u, t, q);
1016
1017
1018 #endif
1019 }
1020 }
1021
1022
1023 // s == k...special case
1024
1025 m = 1L << s;
1026 m_half = 1L << (s-1);
1027 m_fourth = 1L << (s-2);
1028
1029
1030 w = root[s];
1031 wqinv = PrepMulModPrecon(w, q, qinv);
1032
1033 // j = 0, 1
1034
1035 t = AA[m_half];
1036 u = AA[0];
1037 t1 = MulModPrecon(AA[1+ m_half], w, q, wqinv);
1038 u1 = AA[1];
1039
1040 A[0] = AddMod(u, t, q);
1041 A[m_half] = SubMod(u, t, q);
1042 A[1] = AddMod(u1, t1, q);
1043 A[1 + m_half] = SubMod(u1, t1, q);
1044
1045 for (j = 2; j < m_half; j += 2) {
1046 t = MulModPrecon(AA[j + m_half], wtab[j >> 1], q, wqinvtab[j >> 1]);
1047 u = AA[j];
1048 t1 = MulModPrecon(AA[j + 1+ m_half], wtab[j >> 1], q,
1049 wqinvtab[j >> 1]);
1050 t1 = MulModPrecon(t1, w, q, wqinv);
1051 u1 = AA[j + 1];
1052
1053 A[j] = AddMod(u, t, q);
1054 A[j + m_half] = SubMod(u, t, q);
1055 A[j + 1] = AddMod(u1, t1, q);
1056 A[j + 1 + m_half] = SubMod(u1, t1, q);
1057
1058 }
1059 }
1060
1061
1062
1063
1064
1065
1066
1067 #else
1068
1069
1070
1071 // FFT with lazy multiplication
531 // DIRT: with the lazy multiplication strategy, we have to work
532 // with unsigned long's rather than long's. To avoid unnecessary
533 // copying, we simply cast long* to unsigned long*.
534 // Is this standards-compliant? Does it invoke Undefined Behavior?
535 // The C++ standards before C++14 were actually somewhat inconsistent
536 // on this point.
537
538 // In all versions of the C++ and C standards, the "strict aliasing"
539 // rules [basic.lval] have always said that signed/unsigned can
540 // always alias each other. So this does not break the strict
541 // aliasing rules. However, prior to C++14, the section
542 // on Lvalue-to-rvalue conversion [conv.lval] said that
543 // this was actually UB. This has been cleared up in C++14,
544 // where now it is no longer UB. Actually, it seems that the change
545 // to C++14 was cleaning up an inconsistency in the standard
546 // itself, and not really a change in the language definition.
547
548 // In practice, it does make a significant difference in performance
549 // to avoid all these copies, so the default is to avoid them.
550
551 // See: https://stackoverflow.com/questions/30048135/efficient-way-to-bit-copy-a-signed-integer-to-an-unsigned-integer
552
553 // See: https://stackoverflow.com/questions/27109701/aliasing-of-otherwise-equivalent-signed-and-unsigned-types
554 // Especially comments by Columbo regarding N3797 and [conv.lval]
555
556
557
558
559
1072560
1073561 #if (defined(NTL_LONGLONG_SP_MULMOD))
1074562
1278766 }
1279767
1280768
769 typedef long mint_t;
770 typedef unsigned long umint_t;
771 // For readability and to make it easier to adapt this
772 // code to other settings
773
1281774 static inline
1282 unsigned long LazyReduce1(unsigned long a, long q)
1283 {
1284 return sp_CorrectExcess(long(a), q);
775 umint_t LazyReduce1(umint_t a, mint_t q)
776 {
777 return sp_CorrectExcess(mint_t(a), q);
1285778 }
1286779
1287780 static inline
1288 unsigned long LazyReduce2(unsigned long a, long q)
781 umint_t LazyReduce2(umint_t a, mint_t q)
1289782 {
1290783 return sp_CorrectExcess(a, 2*q);
1291784 }
1292785
1293786
1294
1295
1296 // FFT: Lazy, no tables
1297
1298 void NTL_FFT_ROUTINE_NOTAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
1299
1300 // performs a 2^k-point convolution modulo q
1301
1302 {
1303 long q = info.q;
1304 const long *root = info.RootTable[dir].elts();
1305 mulmod_t qinv = info.qinv;
1306
1307 if (k <= 1) {
1308 if (k == 0) {
1309 A[0] = a[0];
1310 return;
1311 }
1312 if (k == 1) {
1313 long a0 = AddMod(a[0], a[1], q);
1314 long a1 = SubMod(a[0], a[1], q);
1315 A[0] = a0;
1316 A[1] = a1;
1317 return;
1318 }
1319 }
1320
1321 // assume k >= 2
1322
1323 NTL_TLS_LOCAL(Vec<unsigned long>, AA_store);
1324 AA_store.SetLength(1L << k);
1325 unsigned long *AA = AA_store.elts();
1326
1327 NTL_TLS_LOCAL(Vec<long>, wtab_store);
1328 wtab_store.SetLength(max(2, 1L << (k-2)));
1329 // allocate space for at least 2 elements, to deal with a corner case when k == 2
1330 long * NTL_RESTRICT wtab = wtab_store.elts();
1331
1332 NTL_TLS_LOCAL(Vec<mulmod_precon_t>, wqinvtab_store);
1333 wqinvtab_store.SetLength(max(2, 1L << (k-2)));
1334 // allocate space for at least 2 elements, to deal with a corner case when k == 2
1335 mulmod_precon_t * NTL_RESTRICT wqinvtab = wqinvtab_store.elts();
1336
1337
1338 BitReverseCopy(AA, a, k);
1339
1340 long n = 1L << k;
1341
1342
1343 /* we work with redundant representations, in the range [0, 4q) */
1344
1345 long s, m, m_half, m_fourth, i, j;
1346 unsigned long t, u, t1, u1;
1347
1348
1349 wtab[0] = 1;
1350 wqinvtab[0] = LazyPrepMulModPrecon(1, q, qinv);
1351
1352 // s = 1
1353 for (i = 0; i < n; i += 2) {
1354 t = AA[i + 1];
1355 u = AA[i];
1356 AA[i] = u + t;
1357 AA[i+1] = u - t + q;
1358 }
1359
1360 // s = 2
1361 {
1362 long w = root[2];
1363 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, qinv);
1364
1365 wtab[1] = w;
1366 wqinvtab[1] = wqinv;
1367
1368
1369 for (i = 0; i < n; i += 4) {
1370
1371 unsigned long * NTL_RESTRICT AA0 = &AA[i];
1372 unsigned long * NTL_RESTRICT AA1 = &AA[i + 2];
1373
1374 {
1375 const unsigned long a11 = AA1[0];
1376 const unsigned long a01 = AA0[0];
1377
1378 const unsigned long tt1 = a11;
1379 const unsigned long uu1 = a01;
1380 const unsigned long b01 = uu1 + tt1;
1381 const unsigned long b11 = uu1 - tt1 + 2*q;
1382
1383 AA0[0] = b01;
1384 AA1[0] = b11;
1385 }
1386 {
1387 const unsigned long a11 = AA1[1];
1388 const unsigned long a01 = AA0[1];
1389
1390 const unsigned long tt1 = LazyMulModPrecon(a11, w, q, wqinv);
1391 const unsigned long uu1 = a01;
1392 const unsigned long b01 = uu1 + tt1;
1393 const unsigned long b11 = uu1 - tt1 + 2*q;
1394
1395 AA0[1] = b01;
1396 AA1[1] = b11;
1397 }
1398 }
1399 }
1400
1401
1402 // s = 3..k-1
1403
1404 for (s = 3; s < k; s++) {
1405 m = 1L << s;
1406 m_half = 1L << (s-1);
1407 m_fourth = 1L << (s-2);
1408
1409 long w = root[s];
787 // inputs in [0, 2*n), output in [0, 4*n)
788 static inline
789 umint_t LazyAddMod(umint_t a, umint_t b, mint_t n)
790 {
791 return a+b;
792 }
793
794 // inputs in [0, 2*n), output in [0, 4*n)
795 static inline
796 umint_t LazySubMod(umint_t a, umint_t b, mint_t n)
797 {
798 return a-b+2*n;
799 }
800
801 // inputs in [0, 2*n), output in [0, 2*n)
802 static inline
803 umint_t LazyAddMod2(umint_t a, umint_t b, mint_t n)
804 {
805 umint_t r = a+b;
806 return sp_CorrectExcess(r, 2*n);
807 }
808
809 // inputs in [0, 2*n), output in [0, 2*n)
810 static inline
811 umint_t LazySubMod2(umint_t a, umint_t b, mint_t n)
812 {
813 umint_t r = a-b;
814 return sp_CorrectDeficit(r, 2*n);
815 }
816
817 #ifdef NTL_AVOID_BRANCHING
818
819 // x, y in [0, 4*m)
820 // returns x + y mod 4*m, in [0, 4*m)
821 inline static umint_t
822 LazyAddMod4(umint_t x, umint_t y, mint_t m)
823 {
824 x = LazyReduce2(x, m);
825 y = LazyReduce2(y, m);
826 return x+y;
827 }
828
829 // x, y in [0, 4*m)
830 // returns x - y mod 4*m, in [0, 4*m)
831 inline static umint_t
832 LazySubMod4(umint_t x, umint_t y, mint_t m)
833 {
834 x = LazyReduce2(x, m);
835 y = LazyReduce2(y, m);
836 return x-y+2*m;
837 }
838
839 #else
840
841 static inline umint_t
842 LazyAddMod4(umint_t x, umint_t y, umint_t m)
843 {
844 y = 4*m - y;
845 umint_t z = x - y;
846 z += (x < y) ? 4*m : 0;
847 return z;
848 }
849
850
851 static inline umint_t
852 LazySubMod4(umint_t x, umint_t y, umint_t m)
853 {
854 umint_t z = x - y;
855 z += (x < y) ? 4*m : 0;
856 return z;
857 }
858
859 #endif
860
861 // Input and output in [0, 4*n)
862 static inline umint_t
863 LazyDoubleMod4(umint_t a, mint_t n)
864 {
865 return 2 * LazyReduce2(a, n);
866 }
867
868 // Input and output in [0, 2*n)
869 static inline umint_t
870 LazyDoubleMod2(umint_t a, mint_t n)
871 {
872 return 2 * LazyReduce1(a, n);
873 }
874
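// A small worked sketch (not compiled) of the range contracts documented
// above, using a hypothetical tiny prime q = 5 and two inputs already in the
// redundant range [0, 2*q).  The values in the comments follow directly from
// the definitions of the helpers; LazyReduce1/LazyReduce2 are the final
// reduction helpers used after the last FFT layer.
#if 0
static void lazy_range_sketch()
{
   const mint_t q = 5;
   umint_t a = 7, b = 9;                    // both in [0, 2*q) = [0, 10)

   umint_t s4 = LazyAddMod(a, b, q);        // 16,           in [0, 4*q)
   umint_t d4 = LazySubMod(a, b, q);        // 7-9+10 = 8,   in [0, 4*q)
   umint_t s2 = LazyAddMod2(a, b, q);       // 16-10 = 6,    in [0, 2*q)
   umint_t d2 = LazySubMod2(a, b, q);       // 8,            in [0, 2*q)

   // full reduction at the very end, as done after the last FFT layer:
   umint_t r = LazyReduce1(LazyReduce2(s4, q), q);   // 16 mod 5 = 1
}
#endif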
875 void ComputeMultipliers(Vec<FFTVectorPair>& v, long k, mint_t q, mulmod_t qinv, const mint_t* root)
876 {
877
878 long old_len = v.length();
879 v.SetLength(k+1);
880
881 for (long s = max(old_len, 1); s <= k; s++) {
882 v[s].wtab_precomp.SetLength(1L << (s-1));
883 v[s].wqinvtab_precomp.SetLength(1L << (s-1));
884 }
885
886 if (k >= 1) {
887 v[1].wtab_precomp[0] = 1;
888 v[1].wqinvtab_precomp[0] = LazyPrepMulModPrecon(1, q, qinv);
889 }
890
891 if (k >= 2) {
892 v[2].wtab_precomp[0] = v[1].wtab_precomp[0];
893 v[2].wtab_precomp[1] = root[2];
894 v[2].wqinvtab_precomp[0] = v[1].wqinvtab_precomp[0];
895 v[2].wqinvtab_precomp[1] = LazyPrepMulModPrecon(root[2], q, qinv);
896 }
897
898 for (long s = 3; s <= k; s++) {
899 long m = 1L << s;
900 long m_half = 1L << (s-1);
901 long m_fourth = 1L << (s-2);
902 mint_t* NTL_RESTRICT wtab = v[s].wtab_precomp.elts();
903 mint_t* NTL_RESTRICT wtab1 = v[s-1].wtab_precomp.elts();
904 mulmod_precon_t* NTL_RESTRICT wqinvtab = v[s].wqinvtab_precomp.elts();
905 mulmod_precon_t* NTL_RESTRICT wqinvtab1 = v[s-1].wqinvtab_precomp.elts();
906
907 mint_t w = root[s];
908 umint_t wqinv_rem;
909 mulmod_precon_t wqinv = LazyPrepMulModPreconWithRem(wqinv_rem, w, q, qinv);
910
911
912 for (long i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
913 mint_t w_j = wtab1[j];
914 mulmod_precon_t wqi_j = wqinvtab1[j];
1410915
1411916 #if 0
1412 // This computes all the multipliers in a straightforward fashion.
1413       // It's a bit slower than the strategy used below, even if
1414 // NTL_LONGLONG_SP_MULMOD is set
1415
1416 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, qinv);
1417
1418
1419 for (i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
1420 long w_j = wtab[j];
1421 mulmod_precon_t wqi_j = wqinvtab[j];
1422
1423 long w_i = LazyReduce1(LazyMulModPrecon(w_j, w, q, wqinv), q);
1424 mulmod_precon_t wqi_i = LazyPrepMulModPrecon(w_i, q, qinv);
917 mint_t w_i = LazyReduce1(LazyMulModPrecon(w_j, w, q, wqinv), q);
918 mulmod_precon_t wqi_i = LazyMulModPreconQuo(wqinv_rem, w_j, q, wqi_j)
919 + cast_unsigned(w_j)*wqinv;
920 #else
921 // This code sequence makes sure the compiler sees
922 // that the product w_j*wqinv needs to be computed just once
923 ll_type x;
924 ll_mul(x, w_j, wqinv);
925 umint_t hi = ll_get_hi(x);
926 umint_t lo = ll_get_lo(x);
927 umint_t r = cast_unsigned(w_j)*cast_unsigned(w) - hi*cast_unsigned(q);
928
929 mint_t w_i = LazyReduce1(r, q);
930 mulmod_precon_t wqi_i = lo+LazyMulModPreconQuo(wqinv_rem, w_j, q, wqi_j);
931 #endif
1425932
1426933 wtab[i-1] = w_j;
1427934 wqinvtab[i-1] = wqi_j;
1428935 wtab[i] = w_i;
1429936 wqinvtab[i] = wqi_i;
1430937 }
938 }
939
940 #if 0
941 // verify result
942 for (long s = 1; s <= k; s++) {
943 mint_t *wtab = v[s].wtab_precomp.elts();
944 mulmod_precon_t *wqinvtab = v[s].wqinvtab_precomp.elts();
945 long m_half = 1L << (s-1);
946
947 mint_t w = root[s];
948 mint_t w_i = 1;
949 for (long i = 0; i < m_half; i++) {
950 if (wtab[i] != w_i || wqinvtab[i] != LazyPrepMulModPrecon(w_i, q, qinv))
951 Error("bad table entry");
952 w_i = MulMod(w_i, w, q, qinv);
953 }
954 }
955 #endif
956 }
957
958
1431959 #else
1432 unsigned long wqinv_rem;
1433 mulmod_precon_t wqinv = LazyPrepMulModPreconWithRem(wqinv_rem, w, q, qinv);
1434
1435
1436 for (i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
1437 long w_j = wtab[j];
1438 mulmod_precon_t wqi_j = wqinvtab[j];
1439
1440 // The next two lines are equivalent, but the first involves
1441 // a computation of hi(w_j*wqinv), which pairs with the
1442 // computation of lo(w_j*wqinv) below...but I don't think
1443 // the compiler sees this...oh well...
1444
1445 long w_i = LazyReduce1(LazyMulModPrecon(w_j, w, q, wqinv), q);
1446 // long w_i = LazyReduce1(LazyMulModPrecon(w, w_j, q, wqi_j), q);
1447
1448 mulmod_precon_t wqi_i = LazyMulModPreconQuo(wqinv_rem, w_j, q, wqi_j)
1449 + cast_unsigned(w_j)*wqinv;
960
961
962 // Hacks to make the LAZY code work with ordinary modular arithmetic
963
964 typedef long mint_t;
965 typedef long umint_t;
966
967 static inline mint_t IdentityMod(mint_t a, mint_t q) { return a; }
968 static inline mint_t DoubleMod(mint_t a, mint_t q) { return AddMod(a, a, q); }
969
970 #define LazyPrepMulModPrecon PrepMulModPrecon
971 #define LazyMulModPrecon MulModPrecon
972
973 #define LazyReduce1 IdentityMod
974 #define LazyReduce2 IdentityMod
975 #define LazyAddMod AddMod
976 #define LazySubMod SubMod
977 #define LazyAddMod2 AddMod
978 #define LazySubMod2 SubMod
979 #define LazyAddMod4 AddMod
980 #define LazySubMod4 SubMod
981 #define LazyDoubleMod2 DoubleMod
982 #define LazyDoubleMod4 DoubleMod
983
984
985 void ComputeMultipliers(Vec<FFTVectorPair>& v, long k, mint_t q, mulmod_t qinv, const mint_t* root)
986 {
987
988 long old_len = v.length();
989 v.SetLength(k+1);
990
991 for (long s = max(old_len, 1); s <= k; s++) {
992 v[s].wtab_precomp.SetLength(1L << (s-1));
993 v[s].wqinvtab_precomp.SetLength(1L << (s-1));
994 }
995
996 if (k >= 1) {
997 v[1].wtab_precomp[0] = 1;
998 v[1].wqinvtab_precomp[0] = PrepMulModPrecon(1, q, qinv);
999 }
1000
1001 if (k >= 2) {
1002 v[2].wtab_precomp[0] = v[1].wtab_precomp[0];
1003 v[2].wtab_precomp[1] = root[2];
1004 v[2].wqinvtab_precomp[0] = v[1].wqinvtab_precomp[0];
1005 v[2].wqinvtab_precomp[1] = PrepMulModPrecon(root[2], q, qinv);
1006 }
1007
1008 for (long s = 3; s <= k; s++) {
1009 long m = 1L << s;
1010 long m_half = 1L << (s-1);
1011 long m_fourth = 1L << (s-2);
1012 mint_t* NTL_RESTRICT wtab = v[s].wtab_precomp.elts();
1013 mint_t* NTL_RESTRICT wtab1 = v[s-1].wtab_precomp.elts();
1014 mulmod_precon_t* NTL_RESTRICT wqinvtab = v[s].wqinvtab_precomp.elts();
1015 mulmod_precon_t* NTL_RESTRICT wqinvtab1 = v[s-1].wqinvtab_precomp.elts();
1016
1017 mint_t w = root[s];
1018 mulmod_precon_t wqinv = PrepMulModPrecon(w, q, qinv);
1019
1020
1021 for (long i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
1022 mint_t w_j = wtab1[j];
1023 mulmod_precon_t wqi_j = wqinvtab1[j];
1024
1025 mint_t w_i = MulModPrecon(w_j, w, q, wqinv);
1026 mulmod_precon_t wqi_i = PrepMulModPrecon(w_i, q, qinv);
14501027
14511028 wtab[i-1] = w_j;
14521029 wqinvtab[i-1] = wqi_j;
14531030 wtab[i] = w_i;
14541031 wqinvtab[i] = wqi_i;
14551032 }
1456
1457
1033 }
1034
1035 #if 0
1036 // verify result
1037 for (long s = 1; s <= k; s++) {
1038 mint_t *wtab = v[s].wtab_precomp.elts();
1039 mulmod_precon_t *wqinvtab = v[s].wqinvtab_precomp.elts();
1040 long m_half = 1L << (s-1);
1041
1042 mint_t w = root[s];
1043 mint_t w_i = 1;
1044 for (long i = 0; i < m_half; i++) {
1045 if (wtab[i] != w_i || wqinvtab[i] != PrepMulModPrecon(w_i, q, qinv))
1046 Error("bad table entry");
1047 w_i = MulMod(w_i, w, q, qinv);
1048 }
1049 }
14581050 #endif
1459
1460 for (i = 0; i < n; i += m) {
1461
1462 unsigned long * NTL_RESTRICT AA0 = &AA[i];
1463 unsigned long * NTL_RESTRICT AA1 = &AA[i + m_half];
1464
1465
1466 for (j = 0; j < m_half; j += 4) {
1467 {
1468 const long w1 = wtab[j+0];
1469 const mulmod_precon_t wqi1 = wqinvtab[j+0];
1470 const unsigned long a11 = AA1[j+0];
1471 const unsigned long a01 = AA0[j+0];
1472
1473 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1474 const unsigned long uu1 = LazyReduce2(a01, q);
1475 const unsigned long b01 = uu1 + tt1;
1476 const unsigned long b11 = uu1 - tt1 + 2*q;
1477
1478 AA0[j+0] = b01;
1479 AA1[j+0] = b11;
1480 }
1481 {
1482 const long w1 = wtab[j+1];
1483 const mulmod_precon_t wqi1 = wqinvtab[j+1];
1484 const unsigned long a11 = AA1[j+1];
1485 const unsigned long a01 = AA0[j+1];
1486
1487 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1488 const unsigned long uu1 = LazyReduce2(a01, q);
1489 const unsigned long b01 = uu1 + tt1;
1490 const unsigned long b11 = uu1 - tt1 + 2*q;
1491
1492 AA0[j+1] = b01;
1493 AA1[j+1] = b11;
1494 }
1495 {
1496 const long w1 = wtab[j+2];
1497 const mulmod_precon_t wqi1 = wqinvtab[j+2];
1498 const unsigned long a11 = AA1[j+2];
1499 const unsigned long a01 = AA0[j+2];
1500
1501 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1502 const unsigned long uu1 = LazyReduce2(a01, q);
1503 const unsigned long b01 = uu1 + tt1;
1504 const unsigned long b11 = uu1 - tt1 + 2*q;
1505
1506 AA0[j+2] = b01;
1507 AA1[j+2] = b11;
1508 }
1509 {
1510 const long w1 = wtab[j+3];
1511 const mulmod_precon_t wqi1 = wqinvtab[j+3];
1512 const unsigned long a11 = AA1[j+3];
1513 const unsigned long a01 = AA0[j+3];
1514
1515 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1516 const unsigned long uu1 = LazyReduce2(a01, q);
1517 const unsigned long b01 = uu1 + tt1;
1518 const unsigned long b11 = uu1 - tt1 + 2*q;
1519
1520 AA0[j+3] = b01;
1521 AA1[j+3] = b11;
1522 }
1523 }
1524 }
1525 }
1526
1527
1528
1529 // special case: s == k to avoid extraneous computation of constants
1530
1531 if (k > 2) {
1532 s = k;
1533
1534 m = 1L << s;
1535 m_half = 1L << (s-1);
1536 m_fourth = 1L << (s-2);
1537
1538 long w = root[s];
1539 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, qinv);
1540
1541
1542 for (i = 0; i < n; i += m) {
1543
1544 unsigned long * NTL_RESTRICT AA0 = &AA[i];
1545 unsigned long * NTL_RESTRICT AA1 = &AA[i + m_half];
1546
1547 long half_j;
1548
1549 for (j = 0, half_j = 0; j < m_half; j += 4, half_j += 2) {
1550 {
1551 const long w1 = wtab[half_j+0];
1552 const mulmod_precon_t wqi1 = wqinvtab[half_j+0];
1553 const unsigned long a11 = AA1[j+0];
1554 const unsigned long a01 = AA0[j+0];
1555
1556 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1557 const unsigned long uu1 = LazyReduce2(a01, q);
1558 const unsigned long b01 = uu1 + tt1;
1559 const unsigned long b11 = uu1 - tt1 + 2*q;
1560
1561 AA0[j+0] = b01;
1562 AA1[j+0] = b11;
1563 }
1564 {
1565 const long w1 = wtab[half_j+0];
1566 const mulmod_precon_t wqi1 = wqinvtab[half_j+0];
1567 const unsigned long a11 = AA1[j+1];
1568 const unsigned long a01 = AA0[j+1];
1569
1570 const unsigned long tt1 = LazyMulModPrecon(LazyMulModPrecon(a11, w1, q, wqi1),
1571 w, q, wqinv);
1572 const unsigned long uu1 = LazyReduce2(a01, q);
1573 const unsigned long b01 = uu1 + tt1;
1574 const unsigned long b11 = uu1 - tt1 + 2*q;
1575
1576 AA0[j+1] = b01;
1577 AA1[j+1] = b11;
1578 }
1579 {
1580 const long w1 = wtab[half_j+1];
1581 const mulmod_precon_t wqi1 = wqinvtab[half_j+1];
1582 const unsigned long a11 = AA1[j+2];
1583 const unsigned long a01 = AA0[j+2];
1584
1585 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
1586 const unsigned long uu1 = LazyReduce2(a01, q);
1587 const unsigned long b01 = uu1 + tt1;
1588 const unsigned long b11 = uu1 - tt1 + 2*q;
1589
1590 AA0[j+2] = b01;
1591 AA1[j+2] = b11;
1592 }
1593 {
1594 const long w1 = wtab[half_j+1];
1595 const mulmod_precon_t wqi1 = wqinvtab[half_j+1];
1596 const unsigned long a11 = AA1[j+3];
1597 const unsigned long a01 = AA0[j+3];
1598
1599 const unsigned long tt1 = LazyMulModPrecon(LazyMulModPrecon(a11, w1, q, wqi1),
1600 w, q, wqinv);
1601 const unsigned long uu1 = LazyReduce2(a01, q);
1602 const unsigned long b01 = uu1 + tt1;
1603 const unsigned long b11 = uu1 - tt1 + 2*q;
1604
1605 AA0[j+3] = b01;
1606 AA1[j+3] = b11;
1607 }
1608 }
1609 }
1610 }
1611
1612
1613 /* need to reduce redundant representations */
1614
1615 for (i = 0; i < n; i++) {
1616 unsigned long tmp = LazyReduce2(AA[i], q);
1617 A[i] = LazyReduce1(tmp, q);
1618 }
1619 }
1620
1051 }
16211052
16221053 #endif
16231054
16241055
16251056
1626
1627
1628
1629
1630 #ifndef NTL_FFT_LAZYMUL
1631
1632 // FFT with precomputed tables, no lazy mul
1633
16341057 static
1635 void PrecompFFTMultipliers(long k, long q, mulmod_t qinv, const long *root, const FFTMultipliers& tab)
1636 {
1637 if (k < 1) LogicError("PrecompFFTMultipliers: bad input");
1638
1639 do { // NOTE: thread safe lazy init
1640 FFTMultipliers::Builder bld(tab, k+1);
1641 long amt = bld.amt();
1642 if (!amt) break;
1643
1644 long first = k+1-amt;
1645 // initialize entries first..k
1646
1647
1648 for (long s = first; s <= k; s++) {
1649 UniquePtr<FFTVectorPair> item;
1650
1651 if (s == 0) {
1652 bld.move(item); // position 0 not used
1653 continue;
1654 }
1655
1656 if (s == 1) {
1657 item.make();
1658 item->wtab_precomp.SetLength(1);
1659 item->wqinvtab_precomp.SetLength(1);
1660 item->wtab_precomp[0] = 1;
1661 item->wqinvtab_precomp[0] = PrepMulModPrecon(1, q, qinv);
1662 bld.move(item);
1663 continue;
1664 }
1665
1666 item.make();
1667 item->wtab_precomp.SetLength(1L << (s-1));
1668 item->wqinvtab_precomp.SetLength(1L << (s-1));
1669
1670 long m = 1L << s;
1671 long m_half = 1L << (s-1);
1672 long m_fourth = 1L << (s-2);
1673
1674 const long *wtab_last = tab[s-1]->wtab_precomp.elts();
1675 const mulmod_precon_t *wqinvtab_last = tab[s-1]->wqinvtab_precomp.elts();
1676
1677 long *wtab = item->wtab_precomp.elts();
1678 mulmod_precon_t *wqinvtab = item->wqinvtab_precomp.elts();
1679
1680 for (long i = 0; i < m_fourth; i++) {
1681 wtab[i] = wtab_last[i];
1682 wqinvtab[i] = wqinvtab_last[i];
1683 }
1684
1685 long w = root[s];
1686 mulmod_precon_t wqinv = PrepMulModPrecon(w, q, qinv);
1687
1688 // prepare wtab...
1689
1690 if (s == 2) {
1691 wtab[1] = MulModPrecon(wtab[0], w, q, wqinv);
1692 wqinvtab[1] = PrepMulModPrecon(wtab[1], q, qinv);
1693 }
1694 else {
1695 // some software pipelining
1696 long i, j;
1697
1698 i = m_half-1; j = m_fourth-1;
1699 wtab[i-1] = wtab[j];
1700 wqinvtab[i-1] = wqinvtab[j];
1701 wtab[i] = MulModPrecon(wtab[i-1], w, q, wqinv);
1702
1703 i -= 2; j --;
1704
1705 for (; i >= 0; i -= 2, j --) {
1706 long wp2 = wtab[i+2];
1707 long wm1 = wtab[j];
1708 wqinvtab[i+2] = PrepMulModPrecon(wp2, q, qinv);
1709 wtab[i-1] = wm1;
1710 wqinvtab[i-1] = wqinvtab[j];
1711 wtab[i] = MulModPrecon(wm1, w, q, wqinv);
1712 }
1713
1714 wqinvtab[1] = PrepMulModPrecon(wtab[1], q, qinv);
1715 }
1716
1717 bld.move(item);
1718 }
1719 } while (0);
1720 }
1721
1722
1723 // FFT: no lazy, table
1724
1725 void NTL_FFT_ROUTINE_TAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
1726 // performs a 2^k-point convolution modulo q
1727
1728 {
1729 if (!info.bigtab || k > NTL_FFT_BIGTAB_MAXROOT) {
1730 NTL_FFT_ROUTINE_NOTAB(A, a, k, info, dir);
1731 return;
1732 }
1733
1734
1735 long q = info.q;
1736 const long *root = info.RootTable[dir].elts();
1737 mulmod_t qinv = info.qinv;
1738 const FFTMultipliers& tab = info.bigtab->MulTab[dir];
1739
1740
1741 if (k <= 1) {
1742 if (k == 0) {
1743 A[0] = a[0];
1744 return;
1745 }
1746 if (k == 1) {
1747 long a0 = AddMod(a[0], a[1], q);
1748 long a1 = SubMod(a[0], a[1], q);
1749 A[0] = a0;
1750 A[1] = a1;
1751 return;
1752 }
1753 }
1754
1755 // assume k > 1
1756
1757 if (k >= tab.length()) PrecompFFTMultipliers(k, q, qinv, root, tab);
1758
1759 NTL_TLS_LOCAL(Vec<long>, AA_store);
1760 AA_store.SetLength(1L << k);
1761 long *AA = AA_store.elts();
1762
1763 BitReverseCopy(AA, a, k);
1764
1765 long n = 1L << k;
1766
1767 long s, m, m_half, m_fourth, i, j, t, u, t1, u1, tt, tt1;
1768
1769 // s = 1
1770
1771 for (i = 0; i < n; i += 2) {
1772 t = AA[i + 1];
1773 u = AA[i];
1774 AA[i] = AddMod(u, t, q);
1775 AA[i+1] = SubMod(u, t, q);
1776 }
1777
1778
1779 for (s = 2; s < k; s++) {
1780 m = 1L << s;
1781 m_half = 1L << (s-1);
1782 m_fourth = 1L << (s-2);
1783
1784 const long* NTL_RESTRICT wtab = tab[s]->wtab_precomp.elts();
1785 const mulmod_precon_t * NTL_RESTRICT wqinvtab = tab[s]->wqinvtab_precomp.elts();
1786
1787 for (i = 0; i < n; i+= m) {
1788
1789 long * NTL_RESTRICT AA0 = &AA[i];
1790 long * NTL_RESTRICT AA1 = &AA[i + m_half];
1791
1792 #ifdef NTL_PIPELINE
1793
1794 // pipelining: seems to be faster
1795
1796 t = AA1[0];
1797 u = AA0[0];
1798 t1 = MulModPrecon(AA1[1], wtab[1], q, wqinvtab[1]);
1799 u1 = AA0[1];
1800
1801 for (j = 0; j < m_half-2; j += 2) {
1802 long a02 = AA0[j+2];
1803 long a03 = AA0[j+3];
1804 long a12 = AA1[j+2];
1805 long a13 = AA1[j+3];
1806 long w2 = wtab[j+2];
1807 long w3 = wtab[j+3];
1808 mulmod_precon_t wqi2 = wqinvtab[j+2];
1809 mulmod_precon_t wqi3 = wqinvtab[j+3];
1810
1811 tt = MulModPrecon(a12, w2, q, wqi2);
1812 long b00 = AddMod(u, t, q);
1813 long b10 = SubMod(u, t, q);
1814
1815 tt1 = MulModPrecon(a13, w3, q, wqi3);
1816 long b01 = AddMod(u1, t1, q);
1817 long b11 = SubMod(u1, t1, q);
1818
1819 AA0[j] = b00;
1820 AA1[j] = b10;
1821 AA0[j+1] = b01;
1822 AA1[j+1] = b11;
1823
1824
1825 t = tt;
1826 u = a02;
1827 t1 = tt1;
1828 u1 = a03;
1829 }
1830
1831
1832 AA0[j] = AddMod(u, t, q);
1833 AA1[j] = SubMod(u, t, q);
1834 AA0[j + 1] = AddMod(u1, t1, q);
1835 AA1[j + 1] = SubMod(u1, t1, q);
1836 }
1837 #else
1838 for (j = 0; j < m_half; j += 2) {
1839 const long a00 = AA0[j];
1840 const long a01 = AA0[j+1];
1841 const long a10 = AA1[j];
1842 const long a11 = AA1[j+1];
1843
1844 const long w0 = wtab[j];
1845 const long w1 = wtab[j+1];
1846 const mulmod_precon_t wqi0 = wqinvtab[j];
1847 const mulmod_precon_t wqi1 = wqinvtab[j+1];
1848
1849 const long tt = MulModPrecon(a10, w0, q, wqi0);
1850 const long uu = a00;
1851 const long b00 = AddMod(uu, tt, q);
1852 const long b10 = SubMod(uu, tt, q);
1853
1854 const long tt1 = MulModPrecon(a11, w1, q, wqi1);
1855 const long uu1 = a01;
1856 const long b01 = AddMod(uu1, tt1, q);
1857 const long b11 = SubMod(uu1, tt1, q);
1858
1859 AA0[j] = b00;
1860 AA0[j+1] = b01;
1861 AA1[j] = b10;
1862 AA1[j+1] = b11;
1863 }
1864 }
1865 #endif
1866 }
1867
1868
1869 // s == k, special case
1870 {
1871 m = 1L << s;
1872 m_half = 1L << (s-1);
1873 m_fourth = 1L << (s-2);
1874
1875 const long* NTL_RESTRICT wtab = tab[s]->wtab_precomp.elts();
1876 const mulmod_precon_t * NTL_RESTRICT wqinvtab = tab[s]->wqinvtab_precomp.elts();
1877
1878 for (i = 0; i < n; i+= m) {
1879
1880 long * NTL_RESTRICT AA0 = &AA[i];
1881 long * NTL_RESTRICT AA1 = &AA[i + m_half];
1882 long * NTL_RESTRICT A0 = &A[i];
1883 long * NTL_RESTRICT A1 = &A[i + m_half];
1884
1885 #ifdef NTL_PIPELINE
1886
1887 // pipelining: seems to be faster
1888
1889 t = AA1[0];
1890 u = AA0[0];
1891 t1 = MulModPrecon(AA1[1], wtab[1], q, wqinvtab[1]);
1892 u1 = AA0[1];
1893
1894 for (j = 0; j < m_half-2; j += 2) {
1895 long a02 = AA0[j+2];
1896 long a03 = AA0[j+3];
1897 long a12 = AA1[j+2];
1898 long a13 = AA1[j+3];
1899 long w2 = wtab[j+2];
1900 long w3 = wtab[j+3];
1901 mulmod_precon_t wqi2 = wqinvtab[j+2];
1902 mulmod_precon_t wqi3 = wqinvtab[j+3];
1903
1904 tt = MulModPrecon(a12, w2, q, wqi2);
1905 long b00 = AddMod(u, t, q);
1906 long b10 = SubMod(u, t, q);
1907
1908 tt1 = MulModPrecon(a13, w3, q, wqi3);
1909 long b01 = AddMod(u1, t1, q);
1910 long b11 = SubMod(u1, t1, q);
1911
1912 A0[j] = b00;
1913 A1[j] = b10;
1914 A0[j+1] = b01;
1915 A1[j+1] = b11;
1916
1917
1918 t = tt;
1919 u = a02;
1920 t1 = tt1;
1921 u1 = a03;
1922 }
1923
1924
1925 A0[j] = AddMod(u, t, q);
1926 A1[j] = SubMod(u, t, q);
1927 A0[j + 1] = AddMod(u1, t1, q);
1928 A1[j + 1] = SubMod(u1, t1, q);
1929 }
1930 #else
1931 for (j = 0; j < m_half; j += 2) {
1932 const long a00 = AA0[j];
1933 const long a01 = AA0[j+1];
1934 const long a10 = AA1[j];
1935 const long a11 = AA1[j+1];
1936
1937 const long w0 = wtab[j];
1938 const long w1 = wtab[j+1];
1939 const mulmod_precon_t wqi0 = wqinvtab[j];
1940 const mulmod_precon_t wqi1 = wqinvtab[j+1];
1941
1942 const long tt = MulModPrecon(a10, w0, q, wqi0);
1943 const long uu = a00;
1944 const long b00 = AddMod(uu, tt, q);
1945 const long b10 = SubMod(uu, tt, q);
1946
1947 const long tt1 = MulModPrecon(a11, w1, q, wqi1);
1948 const long uu1 = a01;
1949 const long b01 = AddMod(uu1, tt1, q);
1950 const long b11 = SubMod(uu1, tt1, q);
1951
1952 A0[j] = b00;
1953 A0[j+1] = b01;
1954 A1[j] = b10;
1955 A1[j+1] = b11;
1956 }
1957 }
1958 #endif
1959 }
1960
1961 }
1962
1963
1964
1965
1966
1967
1968 #else
1969
1970 // FFT with precomputed tables, lazy mul
1971
1972
1973 static
1974 void LazyPrecompFFTMultipliers(long k, long q, mulmod_t qinv, const long *root, const FFTMultipliers& tab)
1058 void LazyPrecompFFTMultipliers(long k, mint_t q, mulmod_t qinv, const mint_t *root, const FFTMultipliers& tab)
19751059 {
19761060 if (k < 1) LogicError("LazyPrecompFFTMultipliers: bad input");
19771061
20101094 long m_half = 1L << (s-1);
20111095 long m_fourth = 1L << (s-2);
20121096
2013 const long *wtab_last = tab[s-1]->wtab_precomp.elts();
1097 const mint_t *wtab_last = tab[s-1]->wtab_precomp.elts();
20141098 const mulmod_precon_t *wqinvtab_last = tab[s-1]->wqinvtab_precomp.elts();
20151099
2016 long *wtab = item->wtab_precomp.elts();
1100 mint_t *wtab = item->wtab_precomp.elts();
20171101 mulmod_precon_t *wqinvtab = item->wqinvtab_precomp.elts();
20181102
20191103 for (long i = 0; i < m_fourth; i++) {
20211105 wqinvtab[i] = wqinvtab_last[i];
20221106 }
20231107
2024 long w = root[s];
1108 mint_t w = root[s];
20251109 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, qinv);
20261110
20271111 // prepare wtab...
20311115 wqinvtab[1] = LazyPrepMulModPrecon(wtab[1], q, qinv);
20321116 }
20331117 else {
2034 // some software pipelining
20351118 long i, j;
20361119
20371120 i = m_half-1; j = m_fourth-1;
20421125 i -= 2; j --;
20431126
20441127 for (; i >= 0; i -= 2, j --) {
2045 long wp2 = wtab[i+2];
2046 long wm1 = wtab[j];
1128 mint_t wp2 = wtab[i+2];
1129 mint_t wm1 = wtab[j];
20471130 wqinvtab[i+2] = LazyPrepMulModPrecon(wp2, q, qinv);
20481131 wtab[i-1] = wm1;
20491132 wqinvtab[i-1] = wqinvtab[j];
20591142 }
20601143
20611144
2062
2063
2064 #ifdef NTL_BRC_TEST
2065 bool BRC_test_flag = false;
2066 #endif
2067
2068
2069 // FFT: lazy, tables
2070
2071 void NTL_FFT_ROUTINE_TAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
2072
2073 // performs a 2^k-point convolution modulo q
2074
2075 {
2076 if (!info.bigtab || k > NTL_FFT_BIGTAB_MAXROOT) {
2077 NTL_FFT_ROUTINE_NOTAB(A, a, k, info, dir);
1145 //===================================================================
1146
1147 // TRUNCATED FFT
1148
1149 // This code is derived from code originally developed
1150 // by David Harvey. I include his original documentation,
1151 // annotated appropriately to highlight differences in
1152 // the implementation (see NOTEs).
1153
1154 /*
1155 The DFT is defined as follows.
1156
1157 Let the input sequence be a_0, ..., a_{N-1}.
1158
1159 Let w = standard primitive N-th root of 1, i.e. w = g^(2^FFT62_MAX_LGN / N),
1160 where g = some fixed element of Z/pZ of order 2^FFT62_MAX_LGN.
1161
1162 Let Z = an element of (Z/pZ)^* (twisting parameter).
1163
1164 Then the output sequence is
1165 b_j = \sum_{0 <= i < N} Z^i a_i w^(ij'), for 0 <= j < N,
1166 where j' is the length-lgN bit-reversal of j.
1167
1168 Some of the FFT routines can operate on truncated sequences of certain
1169 "admissible" sizes. A size parameter n is admissible if 1 <= n <= N, and n is
1170 divisible by a certain power of 2. The precise power depends on the recursive
1171 array decomposition of the FFT. The smallest admissible n' >= n can be
1172 obtained via fft62_next_size().
1173 */
1174
1175 // NOTE: the twisting parameter is not implemented.
1176 // NOTE: the next admissible size function is called FFTRoundUp,
1177 // and is defined in FFT.h.
1178
1179
1180 /*
1181 Truncated FFT interface is as follows:
1182
1183 xn and yn must be admissible sizes for N.
1184
1185 Input in xp[] is a_0, a_1, ..., a_{xn-1}. Assumes a_i = 0 for xn <= i < N.
1186
1187 Output in yp[] is b_0, ..., b_{yn-1}, i.e. only first yn outputs are computed.
1188
1189 Twisting parameter Z is described by z and lgH. If z == 0, then Z = basic
1190 2^lgH-th root of 1, and must have lgH >= lgN + 1. If z != 0, then Z = z
1191 (and lgH is ignored).
1192
1193 The buffers {xp,xn} and {yp,yn} may overlap, but only if xp == yp.
1194
1195 Inputs are in [0, 2p), outputs are in [0, 2p).
1196
1197 threads = number of OpenMP threads to use.
1198 */
1199
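
// A minimal O(N^2) reference sketch (not compiled) of the forward truncated
// interface above, assuming the twisting parameter is ignored (Z = 1, as in
// the NOTE below) and no lazy/redundant representation (everything fully
// reduced mod q).  Only the first yn outputs are produced, and a_i is treated
// as 0 for xn <= i < N.  The name naive_truncated_fft is hypothetical; only
// the ordinary MulMod/AddMod primitives are used.
#if 0
static void naive_truncated_fft(mint_t* b, const mint_t* a, long xn, long yn,
                                long lgN, mint_t w, mint_t q)
// w must be a primitive N-th root of unity mod q, where N = 2^lgN
{
   long N = 1L << lgN;
   for (long j = 0; j < yn; j++) {
      // j' = length-lgN bit reversal of j
      long jrev = 0;
      for (long s = 0; s < lgN; s++)
         if (j & (1L << s)) jrev |= (1L << (lgN-1-s));

      // wj = w^(j'), built by repeated multiplication (reference code only)
      mint_t wj = 1;
      for (long s = 0; s < jrev; s++) wj = MulMod(wj, w, q);

      mint_t sum = 0;
      mint_t wij = 1;                        // w^(i*j'), updated incrementally
      for (long i = 0; i < xn; i++) {        // a_i = 0 assumed for i >= xn
         sum = AddMod(sum, MulMod(a[i], wij, q), q);
         wij = MulMod(wij, wj, q);
      }
      b[j] = sum;                            // b_j = \sum_i a_i w^(i*j')
   }
}
#endif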
1200
1201
1202 /*
1203 Inverse truncated FFT interface is as follows.
1204
1205 xn and yn must be admissible sizes for N, with yn <= xn.
1206
1207 Input in xp[] is b_0, b_1, ..., b_{yn-1}, N*a_{yn}, ..., N*a_{xn-1}.
1208
1209 Assumes a_i = 0 for xn <= i < N.
1210
1211 Output in yp[] is N*a_0, ..., N*a_{yn-1}.
1212
1213 Twisting parameter Z is described by z and lgH. If z == 0, then Z = basic
1214 2^lgH-th root of 1, and must have lgH >= lgN + 1. If z != 0, then Z = z^(-1)
1215 (and lgH is ignored).
1216
1217 The buffers {xp,xn} and {yp,yn} may overlap, but only if xp == yp.
1218
1219 Inputs are in [0, 4p), outputs are in [0, 4p).
1220
1221 threads = number of OpenMP threads to use.
1222
1223 (note: no function actually implements this interface in full generality!
1224 This is because it is tricky (and not that useful) to implement the twisting
1225 parameter when xn != yn.)
1226 */
1227
1228 // NOTE: threads and twisting parameter are not used here.
1229 // NOTE: the code has been re-written and simplified so that
1230 // everything is done in place, so xp == yp.
1231
1232
1233
1234
1235 //===================================================================
1236
1237
1238
1239
1240
1241
1242 // NOTE: these could be inlined, but I found the code generation
1243 // to be extremely sensitive to seemingly trivial changes,
1244 // so it seems safest to use macros instead.
1245 // w and wqinv are read only once.
1246 // q is read several times.
1247 // xx0, xx1 are each read once and written once
1248
1249 #define fwd_butterfly(xx0, xx1, w, q, wqinv) \
1250 do \
1251 { \
1252 umint_t x0_ = xx0; \
1253 umint_t x1_ = xx1; \
1254 umint_t t_ = LazySubMod(x0_, x1_, q); \
1255 xx0 = LazyAddMod2(x0_, x1_, q); \
1256 xx1 = LazyMulModPrecon(t_, w, q, wqinv); \
1257 } \
1258 while (0)
1259
1260 #define fwd_butterfly_neg(xx0, xx1, w, q, wqinv) \
1261 do \
1262 { \
1263 umint_t x0_ = xx0; \
1264 umint_t x1_ = xx1; \
1265 umint_t t_ = LazySubMod(x1_, x0_, q); /* NEG */ \
1266 xx0 = LazyAddMod2(x0_, x1_, q); \
1267 xx1 = LazyMulModPrecon(t_, w, q, wqinv); \
1268 } \
1269 while (0)
1270
1271 #define fwd_butterfly1(xx0, xx1, w, q, wqinv, w1, w1qinv) \
1272 do \
1273 { \
1274 umint_t x0_ = xx0; \
1275 umint_t x1_ = xx1; \
1276 umint_t t_ = LazySubMod(x0_, x1_, q); \
1277 xx0 = LazyAddMod2(x0_, x1_, q); \
1278 xx1 = LazyMulModPrecon(LazyMulModPrecon(t_, w1, q, w1qinv), w, q, wqinv); \
1279 } \
1280 while (0)
1281
1282
1283 #define fwd_butterfly0(xx0, xx1, q) \
1284 do \
1285 { \
1286 umint_t x0_ = xx0; \
1287 umint_t x1_ = xx1; \
1288 xx0 = LazyAddMod2(x0_, x1_, q); \
1289 xx1 = LazySubMod2(x0_, x1_, q); \
1290 } \
1291 while (0)
1292
1293
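// The forward butterfly written out as a hypothetical inline function, purely
// to document the ranges involved; the macro above is what the code actually
// uses, and this sketch mirrors it line for line.
#if 0
static inline void fwd_butterfly_sketch(umint_t& xx0, umint_t& xx1,
                                        mint_t w, mint_t q, mulmod_precon_t wqinv)
{
   umint_t x0 = xx0;                          // in [0, 2*q)
   umint_t x1 = xx1;                          // in [0, 2*q)
   umint_t t  = LazySubMod(x0, x1, q);        // in [0, 4*q)
   xx0 = LazyAddMod2(x0, x1, q);              // back in [0, 2*q)
   xx1 = LazyMulModPrecon(t, w, q, wqinv);    // t*w mod q, lazily, in [0, 2*q)
}
#endif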
1294 #define NTL_NEW_FFT_THRESH (11)
1295
1296 struct new_mod_t {
1297 mint_t q;
1298 const mint_t **wtab;
1299 const mulmod_precon_t **wqinvtab;
1300 };
1301
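// A minimal sketch (not compiled) of how a new_mod_t is populated from the
// per-level tables built by ComputeMultipliers above: wtab[s] points at the
// 2^(s-1) powers of the level-s root.  Here mul_vec, k and q stand for
// whatever the caller has at hand; the real setup appears in new_fft_notab
// further below.
#if 0
   const mint_t *wtab[NTL_FFTMaxRoot+1];
   const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
   for (long s = 1; s <= k; s++) {
      wtab[s] = mul_vec[s].wtab_precomp.elts();
      wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
   }

   new_mod_t mod;
   mod.q = q;
   mod.wtab = &wtab[0];
   mod.wqinvtab = &wqinvtab[0];
#endif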
1302
1303
1304
1305
1306 // requires size divisible by 8
1307 static void
1308 new_fft_layer(umint_t* xp, long blocks, long size,
1309 const mint_t* NTL_RESTRICT wtab,
1310 const mulmod_precon_t* NTL_RESTRICT wqinvtab,
1311 mint_t q)
1312 {
1313 size /= 2;
1314
1315 do
1316 {
1317 umint_t* NTL_RESTRICT xp0 = xp;
1318 umint_t* NTL_RESTRICT xp1 = xp + size;
1319
1320 // first 4 butterflies
1321 fwd_butterfly0(xp0[0+0], xp1[0+0], q);
1322 fwd_butterfly(xp0[0+1], xp1[0+1], wtab[0+1], q, wqinvtab[0+1]);
1323 fwd_butterfly(xp0[0+2], xp1[0+2], wtab[0+2], q, wqinvtab[0+2]);
1324 fwd_butterfly(xp0[0+3], xp1[0+3], wtab[0+3], q, wqinvtab[0+3]);
1325
1326 // 4-way unroll
1327 for (long j = 4; j < size; j += 4) {
1328 fwd_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
1329 fwd_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
1330 fwd_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
1331 fwd_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
1332 }
1333
1334 xp += 2 * size;
1335 }
1336 while (--blocks != 0);
1337 }
1338
1339
1340 static void
1341 new_fft_last_two_layers(umint_t* xp, long blocks,
1342 const mint_t* wtab, const mulmod_precon_t* wqinvtab,
1343 mint_t q)
1344 {
1345 // 4th root of unity
1346 mint_t w = wtab[1];
1347 mulmod_precon_t wqinv = wqinvtab[1];
1348
1349 do
1350 {
1351 umint_t u0 = xp[0];
1352 umint_t u1 = xp[1];
1353 umint_t u2 = xp[2];
1354 umint_t u3 = xp[3];
1355
1356 umint_t v0 = LazyAddMod2(u0, u2, q);
1357 umint_t v2 = LazySubMod2(u0, u2, q);
1358 umint_t v1 = LazyAddMod2(u1, u3, q);
1359 umint_t t = LazySubMod(u1, u3, q);
1360 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1361
1362 xp[0] = LazyAddMod2(v0, v1, q);
1363 xp[1] = LazySubMod2(v0, v1, q);
1364 xp[2] = LazyAddMod2(v2, v3, q);
1365 xp[3] = LazySubMod2(v2, v3, q);
1366
1367 xp += 4;
1368 }
1369 while (--blocks != 0);
1370 }
1371
1372
1373
1374 void new_fft_base(umint_t* xp, long lgN, const new_mod_t& mod)
1375 {
1376 if (lgN == 0) return;
1377
1378 mint_t q = mod.q;
1379
1380 if (lgN == 1)
1381 {
1382 umint_t x0 = xp[0];
1383 umint_t x1 = xp[1];
1384 xp[0] = LazyAddMod2(x0, x1, q);
1385 xp[1] = LazySubMod2(x0, x1, q);
20781386 return;
2079 }
2080
2081 long q = info.q;
2082 const long *root = info.RootTable[dir].elts();
2083 mulmod_t qinv = info.qinv;
2084 const FFTMultipliers& tab = info.bigtab->MulTab[dir];
1387 }
1388
1389 const mint_t** wtab = mod.wtab;
1390 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1391
1392 long N = 1L << lgN;
1393
1394 for (long j = lgN, size = N, blocks = 1;
1395 j > 2; j--, blocks <<= 1, size >>= 1)
1396 new_fft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
1397
1398 new_fft_last_two_layers(xp, N/4, wtab[2], wqinvtab[2], q);
1399 }
1400
1401
1402 // Implements the truncated FFT interface, described above.
1403 // All computations are done in place, and xp should point to
1404 // an array of size N, all of which may be overwritten
1405 // during the computation.
1406 static
1407 void new_fft_short(umint_t* xp, long yn, long xn, long lgN,
1408 const new_mod_t& mod)
1409 {
1410 long N = 1L << lgN;
1411
1412 if (yn == N)
1413 {
1414 if (xn == N && lgN <= NTL_NEW_FFT_THRESH)
1415 {
1416 // no truncation
1417 new_fft_base(xp, lgN, mod);
1418 return;
1419 }
1420 }
1421
1422 // divide-and-conquer algorithm
1423
1424 long half = N >> 1;
1425 mint_t q = mod.q;
1426
1427 if (yn <= half)
1428 {
1429 if (xn <= half)
1430 {
1431 new_fft_short(xp, yn, xn, lgN - 1, mod);
1432 }
1433 else
1434 {
1435 xn -= half;
1436
1437 // (X, Y) -> X + Y
1438 for (long j = 0; j < xn; j++)
1439 xp[j] = LazyAddMod2(xp[j], xp[j + half], q);
1440
1441 new_fft_short(xp, yn, half, lgN - 1, mod);
1442 }
1443 }
1444 else
1445 {
1446 yn -= half;
1447
1448 umint_t* NTL_RESTRICT xp0 = xp;
1449 umint_t* NTL_RESTRICT xp1 = xp + half;
1450 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
1451 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
1452
1453 if (xn <= half)
1454 {
1455 // X -> (X, w*X)
1456 for (long j = 0; j < xn; j++)
1457 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j], q, wqinvtab[j]);
1458
1459 new_fft_short(xp0, half, xn, lgN - 1, mod);
1460 new_fft_short(xp1, yn, xn, lgN - 1, mod);
1461 }
1462 else
1463 {
1464 xn -= half;
1465
1466 // (X, Y) -> (X + Y, w*(X - Y))
1467 // DIRT: assumes xn is a multiple of 4
1468 fwd_butterfly0(xp0[0], xp1[0], q);
1469 fwd_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
1470 fwd_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
1471 fwd_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
1472 for (long j = 4; j < xn; j+=4) {
1473 fwd_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
1474 fwd_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
1475 fwd_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
1476 fwd_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
1477 }
1478
1479 // X -> (X, w*X)
1480 for (long j = xn; j < half; j++)
1481 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j], q, wqinvtab[j]);
1482
1483 new_fft_short(xp0, half, half, lgN - 1, mod);
1484 new_fft_short(xp1, yn, half, lgN - 1, mod);
1485 }
1486 }
1487 }
1488
1489 static
1490 void new_fft_short_notab(umint_t* xp, long yn, long xn, long lgN,
1491 const new_mod_t& mod, mint_t w, mint_t wqinv)
1492 // This version assumes that we only have tables up to level lgN-1,
1493 // and w generates the values at level lgN.
1494 // DIRT: requires xn even
1495 {
1496 long N = 1L << lgN;
1497
1498 // divide-and-conquer algorithm
1499
1500 long half = N >> 1;
1501 mint_t q = mod.q;
1502
1503 if (yn <= half)
1504 {
1505 if (xn <= half)
1506 {
1507 new_fft_short(xp, yn, xn, lgN - 1, mod);
1508 }
1509 else
1510 {
1511 xn -= half;
1512
1513 // (X, Y) -> X + Y
1514 for (long j = 0; j < xn; j++)
1515 xp[j] = LazyAddMod2(xp[j], xp[j + half], q);
1516
1517 new_fft_short(xp, yn, half, lgN - 1, mod);
1518 }
1519 }
1520 else
1521 {
1522 yn -= half;
1523
1524 umint_t* NTL_RESTRICT xp0 = xp;
1525 umint_t* NTL_RESTRICT xp1 = xp + half;
1526 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN-1];
1527 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN-1];
1528
1529 if (xn <= half)
1530 {
1531 // X -> (X, w*X)
1532 for (long j = 0, j_half = 0; j < xn; j+=2, j_half++) {
1533 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j_half], q, wqinvtab[j_half]);
1534 xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(xp0[j+1], w, q, wqinv),
1535 wtab[j_half], q, wqinvtab[j_half]);
1536 }
1537
1538 new_fft_short(xp0, half, xn, lgN - 1, mod);
1539 new_fft_short(xp1, yn, xn, lgN - 1, mod);
1540 }
1541 else
1542 {
1543 xn -= half;
1544
1545 // (X, Y) -> (X + Y, w*(X - Y))
1546 fwd_butterfly0(xp0[0], xp1[0], q);
1547 fwd_butterfly(xp0[1], xp1[1], w, q, wqinv);
1548 long j = 2;
1549 long j_half = 1;
1550 for (; j < xn; j+=2, j_half++) {
1551 fwd_butterfly(xp0[j], xp1[j], wtab[j_half], q, wqinvtab[j_half]);
1552 fwd_butterfly1(xp0[j+1], xp1[j+1], wtab[j_half], q, wqinvtab[j_half], w, wqinv);
1553 }
1554
1555 // X -> (X, w*X)
1556 for (; j < half; j+=2, j_half++) {
1557 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j_half], q, wqinvtab[j_half]);
1558 xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(xp0[j+1], w, q, wqinv),
1559 wtab[j_half], q, wqinvtab[j_half]);
1560 }
1561
1562 new_fft_short(xp0, half, half, lgN - 1, mod);
1563 new_fft_short(xp1, yn, half, lgN - 1, mod);
1564 }
1565 }
1566 }
1567
1568
1569 //=====
1570
1571
1572 // NOTE: these "flipped" routines perform the same
1573 // functions as their normal, "unflipped" counterparts,
1574 // except that they work with inverted roots.
1575 // They also perform no truncation, just to keep things simple.
1576 // All of this is necessary only to implement the UpdateMap
1577 // routines for ZZ_pX and zz_pX.
1578
1579 // requires size divisible by 8
1580 static void
1581 new_fft_layer_flipped(umint_t* xp, long blocks, long size,
1582 const mint_t* wtab,
1583 const mulmod_precon_t* wqinvtab,
1584 mint_t q)
1585 {
1586 size /= 2;
1587
1588 const mint_t* NTL_RESTRICT wtab1 = wtab + size;
1589 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + size;
1590
1591 do
1592 {
1593 umint_t* NTL_RESTRICT xp0 = xp;
1594 umint_t* NTL_RESTRICT xp1 = xp + size;
1595
1596 // first 4 butterflies
1597 fwd_butterfly0(xp0[0+0], xp1[0+0], q);
1598 fwd_butterfly_neg(xp0[0+1], xp1[0+1], wtab1[-(0+1)], q, wqinvtab1[-(0+1)]);
1599 fwd_butterfly_neg(xp0[0+2], xp1[0+2], wtab1[-(0+2)], q, wqinvtab1[-(0+2)]);
1600 fwd_butterfly_neg(xp0[0+3], xp1[0+3], wtab1[-(0+3)], q, wqinvtab1[-(0+3)]);
1601
1602 // 4-way unroll
1603 for (long j = 4; j < size; j += 4) {
1604 fwd_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1605 fwd_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1606 fwd_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1607 fwd_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1608 }
1609
1610 xp += 2 * size;
1611 }
1612 while (--blocks != 0);
1613 }
1614
1615
1616
1617 static void
1618 new_fft_last_two_layers_flipped(umint_t* xp, long blocks,
1619 const mint_t* wtab, const mulmod_precon_t* wqinvtab,
1620 mint_t q)
1621 {
1622 // 4th root of unity
1623 mint_t w = wtab[1];
1624 mulmod_precon_t wqinv = wqinvtab[1];
1625
1626 do
1627 {
1628 umint_t u0 = xp[0];
1629 umint_t u1 = xp[1];
1630 umint_t u2 = xp[2];
1631 umint_t u3 = xp[3];
1632
1633 umint_t v0 = LazyAddMod2(u0, u2, q);
1634 umint_t v2 = LazySubMod2(u0, u2, q);
1635 umint_t v1 = LazyAddMod2(u1, u3, q);
1636 umint_t t = LazySubMod(u3, u1, q); // NEG
1637 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1638
1639 xp[0] = LazyAddMod2(v0, v1, q);
1640 xp[1] = LazySubMod2(v0, v1, q);
1641 xp[2] = LazyAddMod2(v2, v3, q);
1642 xp[3] = LazySubMod2(v2, v3, q);
1643
1644 xp += 4;
1645 }
1646 while (--blocks != 0);
1647 }
1648
1649
1650
1651 void new_fft_base_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
1652 {
1653 if (lgN == 0) return;
1654
1655 mint_t q = mod.q;
1656
1657 if (lgN == 1)
1658 {
1659 umint_t x0 = xp[0];
1660 umint_t x1 = xp[1];
1661 xp[0] = LazyAddMod2(x0, x1, q);
1662 xp[1] = LazySubMod2(x0, x1, q);
1663 return;
1664 }
1665
1666 const mint_t** wtab = mod.wtab;
1667 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1668
1669 long N = 1L << lgN;
1670
1671 for (long j = lgN, size = N, blocks = 1;
1672 j > 2; j--, blocks <<= 1, size >>= 1)
1673 new_fft_layer_flipped(xp, blocks, size, wtab[j], wqinvtab[j], q);
1674
1675 new_fft_last_two_layers_flipped(xp, N/4, wtab[2], wqinvtab[2], q);
1676 }
1677
1678
1679 static
1680 void new_fft_short_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
1681 {
1682 long N = 1L << lgN;
1683
1684 if (lgN <= NTL_NEW_FFT_THRESH)
1685 {
1686 new_fft_base_flipped(xp, lgN, mod);
1687 return;
1688 }
1689
1690 // divide-and-conquer algorithm
1691
1692 long half = N >> 1;
1693 mint_t q = mod.q;
1694
1695 umint_t* NTL_RESTRICT xp0 = xp;
1696 umint_t* NTL_RESTRICT xp1 = xp + half;
1697 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN] + half;
1698 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN] + half;
1699
1700 // (X, Y) -> (X + Y, w*(X - Y))
1701
1702 fwd_butterfly0(xp0[0], xp1[0], q);
1703 fwd_butterfly_neg(xp0[1], xp1[1], wtab[-1], q, wqinvtab[-1]);
1704 fwd_butterfly_neg(xp0[2], xp1[2], wtab[-2], q, wqinvtab[-2]);
1705 fwd_butterfly_neg(xp0[3], xp1[3], wtab[-3], q, wqinvtab[-3]);
1706 for (long j = 4; j < half; j+=4) {
1707 fwd_butterfly_neg(xp0[j+0], xp1[j+0], wtab[-(j+0)], q, wqinvtab[-(j+0)]);
1708 fwd_butterfly_neg(xp0[j+1], xp1[j+1], wtab[-(j+1)], q, wqinvtab[-(j+1)]);
1709 fwd_butterfly_neg(xp0[j+2], xp1[j+2], wtab[-(j+2)], q, wqinvtab[-(j+2)]);
1710 fwd_butterfly_neg(xp0[j+3], xp1[j+3], wtab[-(j+3)], q, wqinvtab[-(j+3)]);
1711 }
1712
1713 new_fft_short_flipped(xp0, lgN - 1, mod);
1714 new_fft_short_flipped(xp1, lgN - 1, mod);
1715 }
1716
1717
1718
1719 // IFFT (inverse truncated FFT)
1720
1721
1722 #define inv_butterfly0(xx0, xx1, q) \
1723 do \
1724 { \
1725 umint_t x0_ = LazyReduce2(xx0, q); \
1726 umint_t x1_ = LazyReduce2(xx1, q); \
1727 xx0 = LazyAddMod(x0_, x1_, q); \
1728 xx1 = LazySubMod(x0_, x1_, q); \
1729 } while (0)
1730
1731
1732 #define inv_butterfly_neg(xx0, xx1, w, q, wqinv) \
1733 do \
1734 { \
1735 umint_t x0_ = LazyReduce2(xx0, q); \
1736 umint_t x1_ = xx1; \
1737 umint_t t_ = LazyMulModPrecon(x1_, w, q, wqinv); \
1738 xx0 = LazySubMod(x0_, t_, q); /* NEG */ \
1739 xx1 = LazyAddMod(x0_, t_, q); /* NEG */ \
1740 } while (0)
1741
1742 #define inv_butterfly(xx0, xx1, w, q, wqinv) \
1743 do \
1744 { \
1745 umint_t x0_ = LazyReduce2(xx0, q); \
1746 umint_t x1_ = xx1; \
1747 umint_t t_ = LazyMulModPrecon(x1_, w, q, wqinv); \
1748 xx0 = LazyAddMod(x0_, t_, q); \
1749 xx1 = LazySubMod(x0_, t_, q); \
1750 } while (0)
1751
1752 #define inv_butterfly1_neg(xx0, xx1, w, q, wqinv, w1, w1qinv) \
1753 do \
1754 { \
1755 umint_t x0_ = LazyReduce2(xx0, q); \
1756 umint_t x1_ = xx1; \
1757 umint_t t_ = LazyMulModPrecon(LazyMulModPrecon(x1_, w1, q, w1qinv), w, q, wqinv); \
1758 xx0 = LazySubMod(x0_, t_, q); /* NEG */ \
1759 xx1 = LazyAddMod(x0_, t_, q); /* NEG */ \
1760 } while (0)
1761
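
// The inverse butterfly written out as a hypothetical inline function, again
// only to document the ranges; note that, matching the inverse interface
// described earlier, values are allowed to grow to [0, 4*q) here.  This
// sketch mirrors the inv_butterfly macro above.
#if 0
static inline void inv_butterfly_sketch(umint_t& xx0, umint_t& xx1,
                                        mint_t w, mint_t q, mulmod_precon_t wqinv)
{
   umint_t x0 = LazyReduce2(xx0, q);               // [0, 4*q) -> [0, 2*q)
   umint_t x1 = xx1;                               // in [0, 4*q)
   umint_t t  = LazyMulModPrecon(x1, w, q, wqinv); // x1*w mod q, lazily, in [0, 2*q)
   xx0 = LazyAddMod(x0, t, q);                     // in [0, 4*q)
   xx1 = LazySubMod(x0, t, q);                     // in [0, 4*q)
}
#endif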
1762
1763 static
1764 void new_ifft_short2(umint_t* yp, long yn, long lgN, const new_mod_t& mod);
1765
1766
1767
1768 // requires size divisible by 8
1769 static void
1770 new_ifft_layer(umint_t* xp, long blocks, long size,
1771 const mint_t* wtab,
1772 const mulmod_precon_t* wqinvtab, mint_t q)
1773 {
1774
1775 size /= 2;
1776 const mint_t* NTL_RESTRICT wtab1 = wtab + size;
1777 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + size;
1778
1779 do
1780 {
1781
1782 umint_t* NTL_RESTRICT xp0 = xp;
1783 umint_t* NTL_RESTRICT xp1 = xp + size;
1784
1785
1786 // first 4 butterflies
1787 inv_butterfly0(xp0[0], xp1[0], q);
1788 inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
1789 inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
1790 inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
1791
1792 // 4-way unroll
1793 for (long j = 4; j < size; j+= 4) {
1794 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1795 inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1796 inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1797 inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1798 }
1799
1800 xp += 2 * size;
1801 }
1802 while (--blocks != 0);
1803 }
1804
1805
1806 static void
1807 new_ifft_first_two_layers(umint_t* xp, long blocks, const mint_t* wtab,
1808 const mulmod_precon_t* wqinvtab, mint_t q)
1809 {
1810 // 4th root of unity
1811 mint_t w = wtab[1];
1812 mulmod_precon_t wqinv = wqinvtab[1];
1813
1814 do
1815 {
1816 umint_t u0 = LazyReduce2(xp[0], q);
1817 umint_t u1 = LazyReduce2(xp[1], q);
1818 umint_t u2 = LazyReduce2(xp[2], q);
1819 umint_t u3 = LazyReduce2(xp[3], q);
1820
1821 umint_t v0 = LazyAddMod2(u0, u1, q);
1822 umint_t v1 = LazySubMod2(u0, u1, q);
1823 umint_t v2 = LazyAddMod2(u2, u3, q);
1824 umint_t t = LazySubMod(u2, u3, q);
1825 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1826
1827 xp[0] = LazyAddMod(v0, v2, q);
1828 xp[2] = LazySubMod(v0, v2, q);
1829 xp[1] = LazySubMod(v1, v3, q); // NEG
1830 xp[3] = LazyAddMod(v1, v3, q); // NEG
1831
1832 xp += 4;
1833 }
1834 while (--blocks != 0);
1835 }
1836
1837
1838
1839 static
1840 void new_ifft_base(umint_t* xp, long lgN, const new_mod_t& mod)
1841 {
1842 if (lgN == 0) return;
1843
1844 mint_t q = mod.q;
1845
1846 if (lgN == 1)
1847 {
1848 umint_t x0 = LazyReduce2(xp[0], q);
1849 umint_t x1 = LazyReduce2(xp[1], q);
1850 xp[0] = LazyAddMod(x0, x1, q);
1851 xp[1] = LazySubMod(x0, x1, q);
1852 return;
1853 }
1854
1855 const mint_t** wtab = mod.wtab;
1856 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1857
1858 long blocks = 1L << (lgN - 2);
1859 new_ifft_first_two_layers(xp, blocks, wtab[2], wqinvtab[2], q);
1860 blocks >>= 1;
1861
1862 long size = 8;
1863 for (long j = 3; j <= lgN; j++, blocks >>= 1, size <<= 1)
1864 new_ifft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
1865 }
1866
1867
1868 static
1869 void new_ifft_short1(umint_t* xp, long yn, long lgN, const new_mod_t& mod)
1870
1871 // Implements truncated inverse FFT interface, but with xn==yn.
1872 // All computations are done in place.
1873
1874 {
1875 long N = 1L << lgN;
1876
1877 if (yn == N && lgN <= NTL_NEW_FFT_THRESH)
1878 {
1879 // no truncation
1880 new_ifft_base(xp, lgN, mod);
1881 return;
1882 }
1883
1884 // divide-and-conquer algorithm
1885
1886 long half = N >> 1;
1887 mint_t q = mod.q;
1888
1889 if (yn <= half)
1890 {
1891 // X -> 2X
1892 for (long j = 0; j < yn; j++)
1893 xp[j] = LazyDoubleMod4(xp[j], q);
1894
1895 new_ifft_short1(xp, yn, lgN - 1, mod);
1896 }
1897 else
1898 {
1899 umint_t* NTL_RESTRICT xp0 = xp;
1900 umint_t* NTL_RESTRICT xp1 = xp + half;
1901 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
1902 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
1903
1904 new_ifft_short1(xp0, half, lgN - 1, mod);
1905
1906 yn -= half;
1907
1908 // X -> (2X, w*X)
1909 for (long j = yn; j < half; j++)
1910 {
1911 umint_t x0 = xp0[j];
1912 xp0[j] = LazyDoubleMod4(x0, q);
1913 xp1[j] = LazyMulModPrecon(x0, wtab[j], q, wqinvtab[j]);
1914 }
1915
1916 new_ifft_short2(xp1, yn, lgN - 1, mod);
1917
1918 // (X, Y) -> (X + Y/w, X - Y/w)
1919 {
1920 const mint_t* NTL_RESTRICT wtab1 = wtab + half;
1921 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + half;
1922
1923 // DIRT: assumes yn is a multiple of 4
1924 inv_butterfly0(xp0[0], xp1[0], q);
1925 inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
1926 inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
1927 inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
1928 for (long j = 4; j < yn; j+=4) {
1929 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1930 inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1931 inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1932 inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1933 }
1934 }
1935 }
1936 }
1937
1938
1939 static
1940 void new_ifft_short1_notab(umint_t* xp, long yn, long lgN, const new_mod_t& mod,
1941 mint_t w, mulmod_precon_t wqinv,
1942 mint_t iw, mulmod_precon_t iwqinv)
1943 // This version assumes that we only have tables up to level lgN-1,
1944 // and w generates the values at level lgN.
1945 // DIRT: requires yn even
1946 {
1947 long N = 1L << lgN;
1948
1949 // divide-and-conquer algorithm
1950
1951 long half = N >> 1;
1952 mint_t q = mod.q;
1953
1954 if (yn <= half)
1955 {
1956 // X -> 2X
1957 for (long j = 0; j < yn; j++)
1958 xp[j] = LazyDoubleMod4(xp[j], q);
1959
1960 new_ifft_short1(xp, yn, lgN - 1, mod);
1961 }
1962 else
1963 {
1964 umint_t* NTL_RESTRICT xp0 = xp;
1965 umint_t* NTL_RESTRICT xp1 = xp + half;
1966 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN-1];
1967 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN-1];
1968
1969 new_ifft_short1(xp0, half, lgN - 1, mod);
1970
1971 yn -= half;
1972
1973 // X -> (2X, w*X)
1974 for (long j = yn, j_half = yn/2; j < half; j+=2, j_half++) {
1975 {
1976 umint_t x0 = xp0[j+0];
1977 xp0[j+0] = LazyDoubleMod4(x0, q);
1978 xp1[j+0] = LazyMulModPrecon(x0, wtab[j_half], q, wqinvtab[j_half]);
1979 }
1980 {
1981 umint_t x0 = xp0[j+1];
1982 xp0[j+1] = LazyDoubleMod4(x0, q);
1983 xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(x0, w, q, wqinv),
1984 wtab[j_half], q, wqinvtab[j_half]);
1985 }
1986 }
1987
1988 new_ifft_short2(xp1, yn, lgN - 1, mod);
1989
1990 // (X, Y) -> (X + Y/w, X - Y/w)
1991 {
1992 const mint_t* NTL_RESTRICT wtab1 = wtab + half/2;
1993 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + half/2;
1994
1995 inv_butterfly0(xp0[0], xp1[0], q);
1996 inv_butterfly(xp0[1], xp1[1], iw, q, iwqinv);
1997 for (long j = 2, j_half = 1; j < yn; j+=2, j_half++) {
1998 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-j_half], q, wqinvtab1[-j_half]);
1999 inv_butterfly1_neg(xp0[j+1], xp1[j+1], wtab1[-j_half], q, wqinvtab1[-j_half], iw, iwqinv);
2000 }
2001 }
2002 }
2003 }
2004
2005
2006
2007 //=========
2008
2009
2010 // requires size divisible by 8
2011 static void
2012 new_ifft_layer_flipped(umint_t* xp, long blocks, long size,
2013 const mint_t* NTL_RESTRICT wtab,
2014 const mulmod_precon_t* NTL_RESTRICT wqinvtab, mint_t q)
2015 {
2016
2017 size /= 2;
2018
2019 do
2020 {
2021
2022 umint_t* NTL_RESTRICT xp0 = xp;
2023 umint_t* NTL_RESTRICT xp1 = xp + size;
2024
2025
2026 // first 4 butterflies
2027 inv_butterfly0(xp0[0], xp1[0], q);
2028 inv_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
2029 inv_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
2030 inv_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
2031
2032 // 4-way unroll
2033 for (long j = 4; j < size; j+= 4) {
2034 inv_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
2035 inv_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
2036 inv_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
2037 inv_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
2038 }
2039
2040 xp += 2 * size;
2041 }
2042 while (--blocks != 0);
2043 }
2044
2045
2046 static void
2047 new_ifft_first_two_layers_flipped(umint_t* xp, long blocks, const mint_t* wtab,
2048 const mulmod_precon_t* wqinvtab, mint_t q)
2049 {
2050 // 4th root of unity
2051 mint_t w = wtab[1];
2052 mulmod_precon_t wqinv = wqinvtab[1];
2053
2054 do
2055 {
2056 umint_t u0 = LazyReduce2(xp[0], q);
2057 umint_t u1 = LazyReduce2(xp[1], q);
2058 umint_t u2 = LazyReduce2(xp[2], q);
2059 umint_t u3 = LazyReduce2(xp[3], q);
2060
2061 umint_t v0 = LazyAddMod2(u0, u1, q);
2062 umint_t v1 = LazySubMod2(u0, u1, q);
2063 umint_t v2 = LazyAddMod2(u2, u3, q);
2064 umint_t t = LazySubMod(u2, u3, q);
2065 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
2066
2067 xp[0] = LazyAddMod(v0, v2, q);
2068 xp[2] = LazySubMod(v0, v2, q);
2069 xp[1] = LazyAddMod(v1, v3, q);
2070 xp[3] = LazySubMod(v1, v3, q);
2071
2072 xp += 4;
2073 }
2074 while (--blocks != 0);
2075 }
2076
2077
2078
2079 static
2080 void new_ifft_base_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
2081 {
2082 if (lgN == 0) return;
2083
2084 mint_t q = mod.q;
2085
2086 if (lgN == 1)
2087 {
2088 umint_t x0 = LazyReduce2(xp[0], q);
2089 umint_t x1 = LazyReduce2(xp[1], q);
2090 xp[0] = LazyAddMod(x0, x1, q);
2091 xp[1] = LazySubMod(x0, x1, q);
2092 return;
2093 }
2094
2095 const mint_t** wtab = mod.wtab;
2096 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
2097
2098 long blocks = 1L << (lgN - 2);
2099 new_ifft_first_two_layers_flipped(xp, blocks, wtab[2], wqinvtab[2], q);
2100 blocks >>= 1;
2101
2102 long size = 8;
2103 for (long j = 3; j <= lgN; j++, blocks >>= 1, size <<= 1)
2104 new_ifft_layer_flipped(xp, blocks, size, wtab[j], wqinvtab[j], q);
2105 }
2106
2107
2108 static
2109 void new_ifft_short1_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
2110 {
2111 long N = 1L << lgN;
2112
2113 if (lgN <= NTL_NEW_FFT_THRESH)
2114 {
2115 new_ifft_base_flipped(xp, lgN, mod);
2116 return;
2117 }
2118
2119 // divide-and-conquer algorithm
2120
2121 long half = N >> 1;
2122 mint_t q = mod.q;
2123
2124 umint_t* NTL_RESTRICT xp0 = xp;
2125 umint_t* NTL_RESTRICT xp1 = xp + half;
2126 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
2127 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
2128
2129 new_ifft_short1_flipped(xp0, lgN - 1, mod);
2130 new_ifft_short1_flipped(xp1, lgN - 1, mod);
2131
2132 // (X, Y) -> (X + Y*w, X - Y*w)
2133
2134 inv_butterfly0(xp0[0], xp1[0], q);
2135 inv_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
2136 inv_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
2137 inv_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
2138 for (long j = 4; j < half; j+=4) {
2139 inv_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
2140 inv_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
2141 inv_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
2142 inv_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
2143 }
2144 }
2145
2146 //=========
2147
2148
2149
2150 static
2151 void new_ifft_short2(umint_t* xp, long yn, long lgN, const new_mod_t& mod)
2152
2153 // Implements truncated inverse FFT interface, but with xn==N.
2154 // All computations are done in place.
2155
2156 {
2157 long N = 1L << lgN;
2158
2159 if (yn == N && lgN <= NTL_NEW_FFT_THRESH)
2160 {
2161 // no truncation
2162 new_ifft_base(xp, lgN, mod);
2163 return;
2164 }
2165
2166 // divide-and-conquer algorithm
2167
2168 long half = N >> 1;
2169 mint_t q = mod.q;
2170
2171 if (yn <= half)
2172 {
2173 // X -> 2X
2174 for (long j = 0; j < yn; j++)
2175 xp[j] = LazyDoubleMod4(xp[j], q);
2176 // (X, Y) -> X + Y
2177 for (long j = yn; j < half; j++)
2178 xp[j] = LazyAddMod4(xp[j], xp[j + half], q);
2179
2180 new_ifft_short2(xp, yn, lgN - 1, mod);
2181
2182 // (X, Y) -> X - Y
2183 for (long j = 0; j < yn; j++)
2184 xp[j] = LazySubMod4(xp[j], xp[j + half], q);
2185 }
2186 else
2187 {
2188 umint_t* NTL_RESTRICT xp0 = xp;
2189 umint_t* NTL_RESTRICT xp1 = xp + half;
2190 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
2191 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
2192
2193 new_ifft_short1(xp0, half, lgN - 1, mod);
2194
2195 yn -= half;
2196
2197
2198 // (X, Y) -> (2X - Y, w*(X - Y))
2199 for (long j = yn; j < half; j++)
2200 {
2201 umint_t x0 = xp0[j];
2202 umint_t x1 = xp1[j];
2203 umint_t u = LazySubMod4(x0, x1, q);
2204 xp0[j] = LazyAddMod4(x0, u, q);
2205 xp1[j] = LazyMulModPrecon(u, wtab[j], q, wqinvtab[j]);
2206 }
2207
2208 new_ifft_short2(xp1, yn, lgN - 1, mod);
2209
2210 // (X, Y) -> (X + Y/w, X - Y/w)
2211 {
2212 const mint_t* NTL_RESTRICT wtab1 = wtab + half;
2213 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + half;
2214
2215 // DIRT: assumes yn is a multiple of 4
2216 inv_butterfly0(xp0[0], xp1[0], q);
2217 inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
2218 inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
2219 inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
2220 for (long j = 4; j < yn; j+=4) {
2221 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
2222 inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
2223 inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
2224 inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
2225 }
2226 }
2227 }
2228 }
2229
2230
2231 //=============================================
2232
2233 // HIGH LEVEL ROUTINES
2234
2235 //=========== FFT without tables ===========
2236
2237
2238 NTL_TLS_GLOBAL_DECL(Vec<umint_t>, AA_store)
2239
2240 NTL_TLS_GLOBAL_DECL(Vec<FFTVectorPair>, mul_vec)
2241
2242 void new_fft_notab(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2243 long yn, long xn)
2244
2245 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2246 // xn and yn are as described above in the truncated FFT interface.
2247 // Both A and a should point to arrays of size 2^k,
2248 // and should either be the same or not overlap at all.
2249 // This version does not use precomputed tables.
2250
2251 {
2252 mint_t q = info.q;
20852253
20862254 if (k <= 1) {
20872255 if (k == 0) {
20892257 return;
20902258 }
20912259 if (k == 1) {
2092 long a0 = AddMod(a[0], a[1], q);
2093 long a1 = SubMod(a[0], a[1], q);
2094 A[0] = a0;
2095 A[1] = a1;
2260 mint_t A0 = AddMod(a[0], a[1], q);
2261 mint_t A1 = SubMod(a[0], a[1], q);
2262 A[0] = A0;
2263 A[1] = A1;
20962264 return;
20972265 }
20982266 }
20992267
21002268 // assume k > 1
2269 const mint_t *root = info.RootTable[0].elts();
2270 mulmod_t qinv = info.qinv;
2271
2272 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2273 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2274
2275 long n = 1L << k;
2276
2277 const mint_t *wtab[NTL_FFTMaxRoot+1];
2278 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2279
2280 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2281 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2282
2283 new_mod_t mod;
2284 mod.q = q;
2285 mod.wtab = &wtab[0];
2286 mod.wqinvtab = &wqinvtab[0];
2287
2288 mint_t w = info.RootTable[0][k];
2289 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2290
2291 #ifdef NTL_FFT_USEBUF
2292 NTL_TLS_GLOBAL_ACCESS(AA_store);
2293 AA_store.SetLength(1L << k);
2294 umint_t *AA = AA_store.elts();
2295
2296 for (long i = 0; i < xn; i++) AA[i] = a[i];
2297
2298 new_fft_short_notab(AA, yn, xn, k, mod, w, wqinv);
2299
2300 for (long i = 0; i < yn; i++) {
2301 A[i] = LazyReduce1(AA[i], q);
2302 }
2303 #else
2304 umint_t *AA = (umint_t *) A;
2305 if (a != A) for (long i = 0; i < xn; i++) AA[i] = a[i];
2306
2307 new_fft_short_notab(AA, yn, xn, k, mod, w, wqinv);
2308
2309 for (long i = 0; i < yn; i++) {
2310 AA[i] = LazyReduce1(AA[i], q);
2311 }
2312 #endif
2313 }
2314
2315
2316 void new_fft_flipped_notab(mint_t* A, const mint_t* a, long k,
2317 const FFTPrimeInfo& info)
2318
2319 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2320 // Both A and a should point to arrays of size 2^k,
2321 // and should either be the same or not overlap at all.
2322 // This version is "flipped" -- it uses inverted roots,
2323 // multiplies by 2^{-k}, and performs no truncations.
2324 // This version does not use precomputed tables.
2325
2326 {
2327 mint_t q = info.q;
2328
2329 if (k <= 1) {
2330 if (k == 0) {
2331 A[0] = a[0];
2332 return;
2333 }
2334 if (k == 1) {
2335 mint_t two_inv = info.TwoInvTable[1];
2336 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2337 mint_t A0 = AddMod(a[0], a[1], q);
2338 mint_t A1 = SubMod(a[0], a[1], q);
2339 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2340 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2341 return;
2342 }
2343 }
2344
2345 // assume k > 1
2346 const mint_t *root = info.RootTable[1].elts();
2347 mulmod_t qinv = info.qinv;
2348
2349 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2350 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2351
2352 long n = 1L << k;
2353
2354 const mint_t *wtab[NTL_FFTMaxRoot+1];
2355 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2356
2357 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2358 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2359
2360 new_mod_t mod;
2361 mod.q = q;
2362 mod.wtab = &wtab[0];
2363 mod.wqinvtab = &wqinvtab[0];
2364
2365 mint_t w = info.RootTable[1][k];
2366 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2367
2368 mint_t two_inv = info.TwoInvTable[k];
2369 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2370
2371 #ifdef NTL_FFT_USEBUF
2372 NTL_TLS_GLOBAL_ACCESS(AA_store);
2373 AA_store.SetLength(1L << k);
2374 umint_t *AA = AA_store.elts();
2375
2376 for (long i = 0; i < n; i++) AA[i] = a[i];
2377
2378 new_fft_short_notab(AA, n, n, k, mod, w, wqinv);
2379
2380 for (long i = 0; i < n; i++) {
2381 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2382 A[i] = LazyReduce1(tmp, q);
2383 }
2384 #else
2385 umint_t *AA = (umint_t *) A;
2386 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2387
2388 new_fft_short_notab(AA, n, n, k, mod, w, wqinv);
2389
2390 for (long i = 0; i < n; i++) {
2391 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2392 AA[i] = LazyReduce1(tmp, q);
2393 }
2394
2395 #endif
2396 }
2397
2398
2399 //=========== Inverse FFT without tables ===========
2400
2401 void new_ifft_notab(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2402 long yn)
2403
2404 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2405 // yn==xn are as described above in the truncated FFT interface.
2406 // Both A and a should point to arrays of size 2^k,
2407 // and should either be the same or not overlap at all.
2408 // Multiplies by 2^{-k}.
2409 // This version does not use precomputed tables.
2410
2411 {
2412 mint_t q = info.q;
2413
2414 if (k <= 1) {
2415 if (k == 0) {
2416 A[0] = a[0];
2417 return;
2418 }
2419 if (k == 1) {
2420 mint_t two_inv = info.TwoInvTable[1];
2421 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2422 mint_t A0 = AddMod(a[0], a[1], q);
2423 mint_t A1 = SubMod(a[0], a[1], q);
2424 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2425 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2426 return;
2427 }
2428 }
2429
2430 // assume k > 1
2431 const mint_t *root = info.RootTable[0].elts();
2432 mulmod_t qinv = info.qinv;
2433
2434 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2435 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2436
2437 long n = 1L << k;
2438
2439 const mint_t *wtab[NTL_FFTMaxRoot+1];
2440 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2441
2442 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2443 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2444
2445 new_mod_t mod;
2446 mod.q = q;
2447 mod.wtab = &wtab[0];
2448 mod.wqinvtab = &wqinvtab[0];
2449
2450
2451 mint_t w = info.RootTable[0][k];
2452 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2453
2454 mint_t iw = info.RootTable[1][k];
2455 mulmod_precon_t iwqinv = LazyPrepMulModPrecon(iw, q, info.qinv);
2456
2457 mint_t two_inv = info.TwoInvTable[k];
2458 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2459
2460 #ifdef NTL_FFT_USEBUF
2461 NTL_TLS_GLOBAL_ACCESS(AA_store);
2462 AA_store.SetLength(1L << k);
2463 umint_t *AA = AA_store.elts();
2464
2465 for (long i = 0; i < yn; i++) AA[i] = a[i];
2466
2467 new_ifft_short1_notab(AA, yn, k, mod, w, wqinv, iw, iwqinv);
2468
2469 for (long i = 0; i < yn; i++) {
2470 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2471 A[i] = LazyReduce1(tmp, q);
2472 }
2473 #else
2474 umint_t *AA = (umint_t *) A;
2475 if (a != A) for (long i = 0; i < yn; i++) AA[i] = a[i];
2476
2477 new_ifft_short1_notab(AA, yn, k, mod, w, wqinv, iw, iwqinv);
2478
2479 for (long i = 0; i < yn; i++) {
2480 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2481 AA[i] = LazyReduce1(tmp, q);
2482 }
2483
2484 #endif
2485 }
2486
2487
2488 void new_ifft_flipped_notab(mint_t* A, const mint_t* a, long k,
2489 const FFTPrimeInfo& info)
2490
2491 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2492 // Flipped means inverse roots are used, no truncation is performed, and
2493 // there is no multiplication by 2^{-k}.
2494 // Both A and a should point to arrays of size 2^k,
2495 // and should either be the same or not overlap at all.
2496 // This version does not use precomputed tables.
2497
2498 {
2499 mint_t q = info.q;
2500
2501 if (k <= 1) {
2502 if (k == 0) {
2503 A[0] = a[0];
2504 return;
2505 }
2506 if (k == 1) {
2507 mint_t A0 = AddMod(a[0], a[1], q);
2508 mint_t A1 = SubMod(a[0], a[1], q);
2509 A[0] = A0;
2510 A[1] = A1;
2511 return;
2512 }
2513 }
2514
2515 // assume k > 1
2516 const mint_t *root = info.RootTable[1].elts();
2517 mulmod_t qinv = info.qinv;
2518
2519 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2520 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2521
2522 long n = 1L << k;
2523
2524 const mint_t *wtab[NTL_FFTMaxRoot+1];
2525 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2526
2527 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2528 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2529
2530 new_mod_t mod;
2531 mod.q = q;
2532 mod.wtab = &wtab[0];
2533 mod.wqinvtab = &wqinvtab[0];
2534
2535 mint_t w = info.RootTable[1][k];
2536 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2537
2538 mint_t iw = info.RootTable[0][k];
2539 mulmod_precon_t iwqinv = LazyPrepMulModPrecon(iw, q, info.qinv);
2540
2541 #ifdef NTL_FFT_USEBUF
2542 NTL_TLS_GLOBAL_ACCESS(AA_store);
2543 AA_store.SetLength(1L << k);
2544 umint_t *AA = AA_store.elts();
2545
2546 for (long i = 0; i < n; i++) AA[i] = a[i];
2547
2548
2549 new_ifft_short1_notab(AA, n, k, mod, w, wqinv, iw, iwqinv);
2550
2551 for (long i = 0; i < n; i++) {
2552 umint_t tmp = LazyReduce2(AA[i], q);
2553 A[i] = LazyReduce1(tmp, q);
2554 }
2555 #else
2556 umint_t *AA = (umint_t *) A;
2557 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2558
2559 new_ifft_short1_notab(AA, n, k, mod, w, wqinv, iw, iwqinv);
2560
2561 for (long i = 0; i < n; i++) {
2562 umint_t tmp = LazyReduce2(AA[i], q);
2563 AA[i] = LazyReduce1(tmp, q);
2564 }
2565 #endif
2566 }
2567
2568
2569 #ifndef NTL_ENABLE_AVX_FFT
2570
2571 //================ FFT with tables ==============
2572
2573
2574 void new_fft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2575 long yn, long xn)
2576
2577 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2578 // xn and yn are as described above in the truncated FFT interface.
2579 // Both A and a should point to arrays of size 2^k,
2580 // and should either be the same or not overlap at all.
2581
2582 {
2583 if (!info.bigtab || k > info.bigtab->bound) {
2584 new_fft_notab(A, a, k, info, yn, xn);
2585 return;
2586 }
2587
2588 mint_t q = info.q;
2589
2590 if (k <= 1) {
2591 if (k == 0) {
2592 A[0] = a[0];
2593 return;
2594 }
2595 if (k == 1) {
2596 mint_t A0 = AddMod(a[0], a[1], q);
2597 mint_t A1 = SubMod(a[0], a[1], q);
2598 A[0] = A0;
2599 A[1] = A1;
2600 return;
2601 }
2602 }
2603
2604 // assume k > 1
2605 const mint_t *root = info.RootTable[0].elts();
2606 mulmod_t qinv = info.qinv;
2607 const FFTMultipliers& tab = info.bigtab->MulTab;
21012608
21022609 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
21032610
2104 NTL_TLS_LOCAL(Vec<unsigned long>, AA_store);
2611
2612 long n = 1L << k;
2613
2614
2615 const mint_t *wtab[NTL_FFTMaxRoot+1];
2616 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2617
2618 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2619 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2620
2621 new_mod_t mod;
2622 mod.q = q;
2623 mod.wtab = &wtab[0];
2624 mod.wqinvtab = &wqinvtab[0];
2625
2626
2627
2628 #ifdef NTL_FFT_USEBUF
2629 NTL_TLS_GLOBAL_ACCESS(AA_store);
21052630 AA_store.SetLength(1L << k);
2106 unsigned long *AA = AA_store.elts();
2631 umint_t *AA = AA_store.elts();
2632
2633 for (long i = 0; i < xn; i++) AA[i] = a[i];
2634
2635 new_fft_short(AA, yn, xn, k, mod);
2636
2637 for (long i = 0; i < yn; i++) {
2638 A[i] = LazyReduce1(AA[i], q);
2639 }
2640 #else
2641 umint_t *AA = (umint_t *) A;
2642 if (a != A) for (long i = 0; i < xn; i++) AA[i] = a[i];
2643
2644 new_fft_short(AA, yn, xn, k, mod);
2645
2646 for (long i = 0; i < yn; i++) {
2647 AA[i] = LazyReduce1(AA[i], q);
2648 }
2649 #endif
2650
2651 }
2652
2653 void new_fft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
2654
2655 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2656 // Both A and a should point to arrays of size 2^k,
2657 // and should either be the same or not overlap at all.
2658 // This version is "flipped" -- it uses inverted roots,
2659 // multiplies by 2^{-k}, and performs no truncations.
2660
2661 {
2662 if (!info.bigtab || k > info.bigtab->bound) {
2663 new_fft_flipped_notab(A, a, k, info);
2664 return;
2665 }
2666
2667 mint_t q = info.q;
2668
2669 if (k <= 1) {
2670 if (k == 0) {
2671 A[0] = a[0];
2672 return;
2673 }
2674 if (k == 1) {
2675 mint_t two_inv = info.TwoInvTable[1];
2676 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2677 mint_t A0 = AddMod(a[0], a[1], q);
2678 mint_t A1 = SubMod(a[0], a[1], q);
2679 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2680 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2681 return;
2682 }
2683 }
2684
2685 // assume k > 1
2686 const mint_t *root = info.RootTable[0].elts();
2687 mulmod_t qinv = info.qinv;
2688 const FFTMultipliers& tab = info.bigtab->MulTab;
2689
2690 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
21072691
21082692
21092693 long n = 1L << k;
21102694
2111 #ifndef NTL_BRC_TEST
2112 BitReverseCopy(AA, a, k);
2695
2696 const mint_t *wtab[NTL_FFTMaxRoot+1];
2697 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2698
2699 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2700 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2701
2702 new_mod_t mod;
2703 mod.q = q;
2704 mod.wtab = &wtab[0];
2705 mod.wqinvtab = &wqinvtab[0];
2706
2707 mint_t two_inv = info.TwoInvTable[k];
2708 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2709
2710
2711 #ifdef NTL_FFT_USEBUF
2712 NTL_TLS_GLOBAL_ACCESS(AA_store);
2713 AA_store.SetLength(1L << k);
2714 umint_t *AA = AA_store.elts();
2715
2716 for (long i = 0; i < n; i++) AA[i] = a[i];
2717
2718 new_fft_short_flipped(AA, k, mod);
2719
2720 for (long i = 0; i < n; i++) {
2721 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2722 A[i] = LazyReduce1(tmp, q);
2723 }
21132724 #else
2114 if (BRC_test_flag)
2115 for (long i = 0; i < n; i++) AA[i] = a[i];
2116 else
2117 BitReverseCopy(AA, a, k);
2725 umint_t *AA = (umint_t *) A;
2726 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2727
2728 new_fft_short_flipped(AA, k, mod);
2729
2730 for (long i = 0; i < n; i++) {
2731 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2732 AA[i] = LazyReduce1(tmp, q);
2733 }
21182734 #endif
2119
2120
2121
2122 /* we work with redundant representations, in the range [0, 4q) */
2123
2124
2125
2126 long s, m, m_half, m_fourth, i, j;
2127 unsigned long t, u, t1, u1;
2128
2129
2130 // s = 1
2131 for (i = 0; i < n; i += 2) {
2132 t = AA[i + 1];
2133 u = AA[i];
2134 AA[i] = u + t;
2135 AA[i+1] = u - t + q;
2136 }
2137
2138
2139 // s = 2
2140 {
2141 const long * NTL_RESTRICT wtab = tab[2]->wtab_precomp.elts();
2142 const mulmod_precon_t * NTL_RESTRICT wqinvtab = tab[2]->wqinvtab_precomp.elts();
2143
2144 const long w1 = wtab[1];
2145 const mulmod_precon_t wqi1 = wqinvtab[1];
2146
2147 for (i = 0; i < n; i += 4) {
2148
2149 unsigned long * NTL_RESTRICT AA0 = &AA[i];
2150 unsigned long * NTL_RESTRICT AA1 = &AA[i + 2];
2151
2152 {
2153 const unsigned long a11 = AA1[0];
2154 const unsigned long a01 = AA0[0];
2155
2156 const unsigned long tt1 = a11;
2157 const unsigned long uu1 = a01;
2158 const unsigned long b01 = uu1 + tt1;
2159 const unsigned long b11 = uu1 - tt1 + 2*q;
2160
2161 AA0[0] = b01;
2162 AA1[0] = b11;
2735 }
2736
2737 //======= Inverse FFT with tables ==============
2738
2739
2740 void new_ifft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2741 long yn)
2742
2743 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2744 // yn==xn are as described above in the truncated FFT interface.
2745 // Both A and a should point to arrays of size 2^k,
2746 // and should either be the same or not overlap at all.
2747 // Multiplies by 2^{-k}.
2748
2749 {
2750 if (!info.bigtab || k > info.bigtab->bound) {
2751 new_ifft_notab(A, a, k, info, yn);
2752 return;
2753 }
2754
2755 mint_t q = info.q;
2756
2757 if (k <= 1) {
2758 if (k == 0) {
2759 A[0] = a[0];
2760 return;
2761 }
2762 if (k == 1) {
2763 mint_t two_inv = info.TwoInvTable[1];
2764 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2765 mint_t A0 = AddMod(a[0], a[1], q);
2766 mint_t A1 = SubMod(a[0], a[1], q);
2767 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2768 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2769 return;
2770 }
2771 }
2772
2773 // assume k > 1
2774 const mint_t *root = info.RootTable[0].elts();
2775 mulmod_t qinv = info.qinv;
2776 const FFTMultipliers& tab = info.bigtab->MulTab;
2777
2778 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2779
2780
2781 long n = 1L << k;
2782
2783
2784 const mint_t *wtab[NTL_FFTMaxRoot+1];
2785 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2786
2787 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2788 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2789
2790 new_mod_t mod;
2791 mod.q = q;
2792 mod.wtab = &wtab[0];
2793 mod.wqinvtab = &wqinvtab[0];
2794
2795 mint_t two_inv = info.TwoInvTable[k];
2796 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2797
2798 #ifdef NTL_FFT_USEBUF
2799 NTL_TLS_GLOBAL_ACCESS(AA_store);
2800 AA_store.SetLength(1L << k);
2801 umint_t *AA = AA_store.elts();
2802
2803 for (long i = 0; i < yn; i++) AA[i] = a[i];
2804
2805 new_ifft_short1(AA, yn, k, mod);
2806
2807 for (long i = 0; i < yn; i++) {
2808 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2809 A[i] = LazyReduce1(tmp, q);
2810 }
2811 #else
2812 umint_t *AA = (umint_t *) A;
2813 if (a != A) for (long i = 0; i < yn; i++) AA[i] = a[i];
2814
2815 new_ifft_short1(AA, yn, k, mod);
2816
2817 for (long i = 0; i < yn; i++) {
2818 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2819 AA[i] = LazyReduce1(tmp, q);
2820 }
2821 #endif
2822 }
2823
2824
2825 void new_ifft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
2826
2827
2828 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2829 // Flipped means inverse roots are used, no truncation is performed, and
2830 // there is no multiplication by 2^{-k}.
2831 // Both A and a should point to arrays of size 2^k,
2832 // and should either be the same or not overlap at all.
2833
2834
2835 {
2836 if (!info.bigtab || k > info.bigtab->bound) {
2837 new_ifft_flipped_notab(A, a, k, info);
2838 return;
2839 }
2840
2841 mint_t q = info.q;
2842
2843 if (k <= 1) {
2844 if (k == 0) {
2845 A[0] = a[0];
2846 return;
2847 }
2848 if (k == 1) {
2849 mint_t A0 = AddMod(a[0], a[1], q);
2850 mint_t A1 = SubMod(a[0], a[1], q);
2851 A[0] = A0;
2852 A[1] = A1;
2853 return;
2854 }
2855 }
2856
2857 // assume k > 1
2858 const mint_t *root = info.RootTable[0].elts();
2859 mulmod_t qinv = info.qinv;
2860 const FFTMultipliers& tab = info.bigtab->MulTab;
2861
2862 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2863
2864
2865 long n = 1L << k;
2866
2867
2868 const mint_t *wtab[NTL_FFTMaxRoot+1];
2869 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2870
2871 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2872 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2873
2874 new_mod_t mod;
2875 mod.q = q;
2876 mod.wtab = &wtab[0];
2877 mod.wqinvtab = &wqinvtab[0];
2878
2879
2880 #ifdef NTL_FFT_USEBUF
2881 NTL_TLS_GLOBAL_ACCESS(AA_store);
2882 AA_store.SetLength(1L << k);
2883 umint_t *AA = AA_store.elts();
2884
2885 for (long i = 0; i < n; i++) AA[i] = a[i];
2886
2887 new_ifft_short1_flipped(AA, k, mod);
2888
2889 for (long i = 0; i < n; i++) {
2890 umint_t tmp = LazyReduce2(AA[i], q);
2891 A[i] = LazyReduce1(tmp, q);
2892 }
2893 #else
2894 umint_t *AA = (umint_t *) A;
2895 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2896
2897 new_ifft_short1_flipped(AA, k, mod);
2898
2899 for (long i = 0; i < n; i++) {
2900 umint_t tmp = LazyReduce2(AA[i], q);
2901 AA[i] = LazyReduce1(tmp, q);
2902 }
2903 #endif
2904 }
2905
2906 #endif
2907
2908 //===============================================
2909
2910 void InitFFTPrimeInfo(FFTPrimeInfo& info, long q, long w, long bigtab_index)
2911 {
2912 mulmod_t qinv = PrepMulMod(q);
2913
2914 long mr = CalcMaxRoot(q);
2915
2916 info.q = q;
2917 info.qinv = qinv;
2918 info.qrecip = 1/double(q);
2919 info.zz_p_context = 0;
2920
2921
2922 info.RootTable[0].SetLength(mr+1);
2923 info.RootTable[1].SetLength(mr+1);
2924 info.TwoInvTable.SetLength(mr+1);
2925 info.TwoInvPreconTable.SetLength(mr+1);
2926
2927 long *rt = &info.RootTable[0][0];
2928 long *rit = &info.RootTable[1][0];
2929 long *tit = &info.TwoInvTable[0];
2930 mulmod_precon_t *tipt = &info.TwoInvPreconTable[0];
2931
2932 long j;
2933 long t;
2934
2935 rt[mr] = w;
2936 for (j = mr-1; j >= 0; j--)
2937 rt[j] = MulMod(rt[j+1], rt[j+1], q);
2938
2939 rit[mr] = InvMod(w, q);
2940 for (j = mr-1; j >= 0; j--)
2941 rit[j] = MulMod(rit[j+1], rit[j+1], q);
2942
2943 t = InvMod(2, q);
2944 tit[0] = 1;
2945 for (j = 1; j <= mr; j++)
2946 tit[j] = MulMod(tit[j-1], t, q);
2947
2948 for (j = 0; j <= mr; j++)
2949 tipt[j] = LazyPrepMulModPrecon(tit[j], q, qinv);
2950
2951 #ifndef NTL_ENABLE_AVX_FFT
2952 if (bigtab_index != -1) {
2953 long bound = NTL_FFT_BIGTAB_MAXROOT-bigtab_index/NTL_FFT_BIGTAB_LIMIT;
2954 if (bound > NTL_FFT_BIGTAB_MINROOT) {
2955 info.bigtab.make();
2956 info.bigtab->bound = bound;
2957 }
2958 }
2959 #else
2960 // with the AVX implementation, we unconditionally use tables
2961 info.bigtab.make();
2962 #endif
2963 }
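// ---------------------------------------------------------------------------
// Editorial note (not part of the upstream sources): assuming w is a primitive
// 2^mr-th root of unity mod q (mr = CalcMaxRoot(q)), the loops above establish
//
//    info.RootTable[0][j] == PowerMod(w, 1L << (mr-j), q)   // primitive 2^j-th root
//    info.RootTable[1][j] == InvMod(info.RootTable[0][j], q)
//    info.TwoInvTable[j]  == InvMod(PowerMod(2, j, q), q)   // 2^{-j} mod q
//
// and TwoInvPreconTable[j] holds the LazyPrepMulModPrecon data for multiplying
// by TwoInvTable[j], which the ifft and *_flipped routines above use to fold
// in the 2^{-k} scaling.
// ---------------------------------------------------------------------------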
2964
2965
2966 //===================================================================
2967
2968 #ifdef NTL_ENABLE_AVX_FFT
2969
2970 static void
2971 pd_LazyPrepMulModPrecon(double *bninv, const double *b, double n, long len)
2972 {
2973 CSRPush push;
2974 pd_LazyPrepMulModPrecon_impl(bninv, b, n, len);
2975 }
2976
2977 static
2978 void LazyPrecompFFTMultipliers(long k, mint_t q, mulmod_t qinv, const mint_t *root, const pd_FFTMultipliers& tab)
2979 {
2980 if (k < 1) LogicError("LazyPrecompFFTMultipliers: bad input");
2981
2982 do { // NOTE: thread safe lazy init
2983 pd_FFTMultipliers::Builder bld(tab, k+1);
2984 long amt = bld.amt();
2985 if (!amt) break;
2986
2987 long first = k+1-amt;
2988 // initialize entries first..k
2989
2990
2991 for (long s = first; s <= k; s++) {
2992 UniquePtr<pd_FFTVectorPair> item;
2993
2994 if (s == 0) {
2995 bld.move(item); // position 0 not used
2996 continue;
21632997 }
2164 {
2165 const unsigned long a11 = AA1[1];
2166 const unsigned long a01 = AA0[1];
2167
2168 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2169 const unsigned long uu1 = a01;
2170 const unsigned long b01 = uu1 + tt1;
2171 const unsigned long b11 = uu1 - tt1 + 2*q;
2172
2173 AA0[1] = b01;
2174 AA1[1] = b11;
2998
2999 long m = 1L << s;
3000 long m_half = 1L << (s-1);
3001
3002 item.make();
3003 item->wtab_precomp.SetLength(m_half);
3004 item->wqinvtab_precomp.SetLength(m_half);
3005
3006 double *wtab = item->wtab_precomp.elts();
3007 double *wqinvtab = item->wqinvtab_precomp.elts();
3008
3009 mint_t w = root[s];
3010 mulmod_precon_t wqinv = PrepMulModPrecon(w, q, qinv);
3011
3012 mint_t wi = 1;
3013 wtab[0] = wi;
3014 for (long i = 1; i < m_half; i++) {
3015 wi = MulModPrecon(wi, w, q, wqinv);
3016 wtab[i] = wi;
21753017 }
2176 }
2177 }
2178
2179
2180 // s = 3..k
2181
2182 for (s = 3; s <= k; s++) {
2183 m = 1L << s;
2184 m_half = 1L << (s-1);
2185 m_fourth = 1L << (s-2);
2186
2187 const long* NTL_RESTRICT wtab = tab[s]->wtab_precomp.elts();
2188 const mulmod_precon_t * NTL_RESTRICT wqinvtab = tab[s]->wqinvtab_precomp.elts();
2189
2190 for (i = 0; i < n; i += m) {
2191
2192 unsigned long * NTL_RESTRICT AA0 = &AA[i];
2193 unsigned long * NTL_RESTRICT AA1 = &AA[i + m_half];
2194
2195 #if 1
2196
2197 // a little loop unrolling: this gives the best code
2198
2199 for (j = 0; j < m_half; j += 4) {
2200 {
2201 const long w1 = wtab[j+0];
2202 const mulmod_precon_t wqi1 = wqinvtab[j+0];
2203 const unsigned long a11 = AA1[j+0];
2204 const unsigned long a01 = AA0[j+0];
2205
2206 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2207 const unsigned long uu1 = LazyReduce2(a01, q);
2208 const unsigned long b01 = uu1 + tt1;
2209 const unsigned long b11 = uu1 - tt1 + 2*q;
2210
2211 AA0[j+0] = b01;
2212 AA1[j+0] = b11;
2213 }
2214 {
2215 const long w1 = wtab[j+1];
2216 const mulmod_precon_t wqi1 = wqinvtab[j+1];
2217 const unsigned long a11 = AA1[j+1];
2218 const unsigned long a01 = AA0[j+1];
2219
2220 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2221 const unsigned long uu1 = LazyReduce2(a01, q);
2222 const unsigned long b01 = uu1 + tt1;
2223 const unsigned long b11 = uu1 - tt1 + 2*q;
2224
2225 AA0[j+1] = b01;
2226 AA1[j+1] = b11;
2227 }
2228 {
2229 const long w1 = wtab[j+2];
2230 const mulmod_precon_t wqi1 = wqinvtab[j+2];
2231 const unsigned long a11 = AA1[j+2];
2232 const unsigned long a01 = AA0[j+2];
2233
2234 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2235 const unsigned long uu1 = LazyReduce2(a01, q);
2236 const unsigned long b01 = uu1 + tt1;
2237 const unsigned long b11 = uu1 - tt1 + 2*q;
2238
2239 AA0[j+2] = b01;
2240 AA1[j+2] = b11;
2241 }
2242 {
2243 const long w1 = wtab[j+3];
2244 const mulmod_precon_t wqi1 = wqinvtab[j+3];
2245 const unsigned long a11 = AA1[j+3];
2246 const unsigned long a01 = AA0[j+3];
2247
2248 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2249 const unsigned long uu1 = LazyReduce2(a01, q);
2250 const unsigned long b01 = uu1 + tt1;
2251 const unsigned long b11 = uu1 - tt1 + 2*q;
2252
2253 AA0[j+3] = b01;
2254 AA1[j+3] = b11;
2255 }
2256 }
2257
2258 #else
2259
2260 // a plain loop: not as good as the unrolled version
2261
2262 for (j = 0; j < m_half; j++) {
2263 const long w1 = wtab[j];
2264 const mulmod_precon_t wqi1 = wqinvtab[j];
2265 const unsigned long a11 = AA1[j];
2266 const unsigned long a01 = AA0[j];
2267
2268 const unsigned long tt1 = LazyMulModPrecon(a11, w1, q, wqi1);
2269 const unsigned long uu1 = LazyReduce2(a01, q);
2270 const unsigned long b01 = uu1 + tt1;
2271 const unsigned long b11 = uu1 - tt1 + 2*q;
2272
2273 AA0[j] = b01;
2274 AA1[j] = b11;
2275 }
3018 pd_LazyPrepMulModPrecon(wqinvtab, wtab, q, m_half);
3019
3020 bld.move(item);
3021 }
3022 } while (0);
3023 }
3024
3025 NTL_TLS_GLOBAL_DECL(AlignedArray<double>, pd_AA_store)
3026 static NTL_CHEAP_THREAD_LOCAL long pd_AA_store_len = 0;
3027
3028
3029 #define PD_MIN_K (NTL_LG2_PDSZ+3)
3030 // k must be at least PD_MIN_K
3031
3032 void new_fft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
3033 long yn, long xn)
3034 {
3035 if (k < PD_MIN_K) {
3036 new_fft_notab(A, a, k, info, yn, xn);
3037 return;
3038 }
3039
3040 long dir = 0;
3041
3042 mint_t q = info.q;
3043 const mint_t *root = info.RootTable[dir].elts();
3044 mulmod_t qinv = info.qinv;
3045 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[dir];
3046
3047 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3048
3049 const double *wtab[NTL_FFTMaxRoot+1];
3050 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3051
3052 const double *wqinvtab[NTL_FFTMaxRoot+1];
3053 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3054
3055 pd_mod_t mod;
3056 mod.q = q;
3057 mod.wtab = &wtab[0];
3058 mod.wqinvtab = &wqinvtab[0];
3059
3060 long n = 1L << k;
3061
3062 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3063 if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3064 double *AA = pd_AA_store.elts();
3065
3066 CSRPush push;
3067 pd_fft_trunc_impl(A, a, AA, k, mod, yn, xn);
3068 }
3069
3070
3071
3072 void new_fft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
3073 {
3074 if (k < PD_MIN_K) {
3075 new_fft_flipped_notab(A, a, k, info);
3076 return;
3077 }
3078
3079 long dir = 1;
3080
3081 mint_t q = info.q;
3082 const mint_t *root = info.RootTable[dir].elts();
3083 mulmod_t qinv = info.qinv;
3084 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[dir];
3085
3086 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3087
3088 const double *wtab[NTL_FFTMaxRoot+1];
3089 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3090
3091 const double *wqinvtab[NTL_FFTMaxRoot+1];
3092 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3093
3094 pd_mod_t mod;
3095 mod.q = q;
3096 mod.wtab = &wtab[0];
3097 mod.wqinvtab = &wqinvtab[0];
3098
3099 long n = 1L << k;
3100
3101 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3102 if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3103 double *AA = pd_AA_store.elts();
3104
3105 CSRPush push;
3106 pd_fft_trunc_impl(A, a, AA, k, mod, n, n, info.TwoInvTable[k]);
3107 }
3108
3109
3110 void new_ifft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
3111 long yn)
3112 {
3113 if (k < PD_MIN_K) {
3114 new_ifft_notab(A, a, k, info, yn);
3115 return;
3116 }
3117
3118 long dir = 0;
3119
3120 mint_t q = info.q;
3121 const mint_t *root = info.RootTable[1-dir].elts();
3122 const mint_t *root1 = info.RootTable[dir].elts();
3123 mulmod_t qinv = info.qinv;
3124 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[1-dir];
3125 const pd_FFTMultipliers& tab1 = info.bigtab->pd_MulTab[dir];
3126
3127 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3128 if (k >= tab1.length()) LazyPrecompFFTMultipliers(k, q, qinv, root1, tab1);
3129
3130 const double *wtab[NTL_FFTMaxRoot+1];
3131 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3132
3133 const double *wqinvtab[NTL_FFTMaxRoot+1];
3134 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3135
3136 const double *wtab1[NTL_FFTMaxRoot+1];
3137 for (long s = 1; s <= k; s++) wtab1[s] = tab1[s]->wtab_precomp.elts();
3138
3139 const double *wqinvtab1[NTL_FFTMaxRoot+1];
3140 for (long s = 1; s <= k; s++) wqinvtab1[s] = tab1[s]->wqinvtab_precomp.elts();
3141
3142 pd_mod_t mod;
3143 mod.q = q;
3144 mod.wtab = &wtab[0];
3145 mod.wqinvtab = &wqinvtab[0];
3146 mod.wtab1 = &wtab1[0];
3147 mod.wqinvtab1 = &wqinvtab1[0];
3148
3149 long n = 1L << k;
3150
3151 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3152 if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3153 double *AA = pd_AA_store.elts();
3154
3155 CSRPush push;
3156 pd_ifft_trunc_impl(A, a, AA, k, mod, yn, info.TwoInvTable[k]);
3157 }
3158
3159
3160 void new_ifft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
3161 {
3162 if (k < PD_MIN_K) {
3163 new_ifft_flipped_notab(A, a, k, info);
3164 return;
3165 }
3166
3167 long dir = 1;
3168
3169 mint_t q = info.q;
3170 const mint_t *root = info.RootTable[1-dir].elts();
3171 const mint_t *root1 = info.RootTable[dir].elts();
3172 mulmod_t qinv = info.qinv;
3173 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[1-dir];
3174 const pd_FFTMultipliers& tab1 = info.bigtab->pd_MulTab[dir];
3175
3176 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3177 if (k >= tab1.length()) LazyPrecompFFTMultipliers(k, q, qinv, root1, tab1);
3178
3179 const double *wtab[NTL_FFTMaxRoot+1];
3180 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3181
3182 const double *wqinvtab[NTL_FFTMaxRoot+1];
3183 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3184
3185 const double *wtab1[NTL_FFTMaxRoot+1];
3186 for (long s = 1; s <= k; s++) wtab1[s] = tab1[s]->wtab_precomp.elts();
3187
3188 const double *wqinvtab1[NTL_FFTMaxRoot+1];
3189 for (long s = 1; s <= k; s++) wqinvtab1[s] = tab1[s]->wqinvtab_precomp.elts();
3190
3191 pd_mod_t mod;
3192 mod.q = q;
3193 mod.wtab = &wtab[0];
3194 mod.wqinvtab = &wqinvtab[0];
3195 mod.wtab1 = &wtab1[0];
3196 mod.wqinvtab1 = &wqinvtab1[0];
3197
3198 long n = 1L << k;
3199
3200 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3201 if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3202 double *AA = pd_AA_store.elts();
3203
3204 CSRPush push;
3205 pd_ifft_trunc_impl(A, a, AA, k, mod, n);
3206 }
22763207
22773208 #endif
22783209
2279 }
2280 }
2281
2282 /* need to reduce redundant representations */
2283
2284 for (i = 0; i < n; i++) {
2285 unsigned long tmp = LazyReduce2(AA[i], q);
2286 A[i] = LazyReduce1(tmp, q);
2287 }
2288 }
2289
2290
2291
2292
2293
2294 #endif
2295
2296
2297
2298
22993210
23003211 NTL_END_IMPL
1313 GF2EInfoT::GF2EInfoT(const GF2X& NewP)
1414 {
1515 build(p, NewP);
16
17 if (p.size == 1) {
18 if (deg(p) <= NTL_BITS_PER_LONG/2)
16 _card_exp = p.n;
17
18 long sz = p.size;
19
20 // The following crossovers were set using the programs
21 // GF2EXKarCross.cpp, GF2EXModCross.cpp, GF2EXModCross.cpp,
22 // and GF2EXGCDCross.cpp.
23 // To use these programs, one has to remove the #if 0 guards
24 // in GF2EX.cpp on mul_disable_plain, BuildPlain, and DivRemPlain.
25
26 // There are three different configurations that are treated separately:
27 // * with gf2x lib and with pclmul instruction available
28 // * without gf2x lib but with pclmul
29 // * without gf2x lib and without pclmul
30 // It is possible that one could be using gf2x lib on a platform without
31 // pclmul, in which case the crossovers used here are not optimal. It is also
32 // possible that one could be using gf2x lib with pclmul, but compile NTL with
33 // NATIVE=off, so that NTL assumes there is no pclmul. Again, this will lead
34 // to crossovers that are not optimal.
35
36 // The crossovers were calculated based on a Skylake Xeon processor:
37 // Intel(R) Xeon(R) Gold 6132 CPU @ 2.60GHz.
38
39
40 #if (defined(NTL_GF2X_LIB) && defined(NTL_HAVE_PCLMUL))
41
42 //========== KarCross ==========
43
44 if (sz <= 1) {
45 if (deg(p) <= NTL_BITS_PER_LONG/2)
46 KarCross = 3;
47 else
1948 KarCross = 4;
49 }
50 else if (sz <= 6) KarCross = 8;
51 else if (sz <= 9) KarCross = 4;
52 else KarCross = 2;
53
54
55
56 //========== ModCross ==========
57
58
59 if (sz <= 1) {
60 if (deg(p) <= NTL_BITS_PER_LONG/2)
61 ModCross = 15;
62 else
63 ModCross = 20;
64 }
65 else if (sz <= 9) ModCross = 60;
66 else if (sz <= 18) ModCross = 25;
67 else ModCross = 15;
68
69
70 //========== DivCross ==========
71
72 if (sz <= 1) {
73 if (deg(p) <= NTL_BITS_PER_LONG/2)
74 DivCross = 50;
75 else
76 DivCross = 75;
77 }
78 else if (sz <= 2) DivCross = 100;
79 else if (sz <= 3) DivCross = 150;
80 else if (sz <= 4) DivCross = 200;
81 else if (sz <= 6) DivCross = 250;
82 else if (sz <= 9) DivCross = 225;
83 else if (sz <= 15) DivCross = 125;
84 else if (sz < 125) DivCross = 100;
85 else DivCross = 75;
86
87 //========== GCDCross ==========
88
89 if (sz <= 1) {
90 if (deg(p) <= NTL_BITS_PER_LONG/2)
91 GCDCross = 225;
92 else
93 GCDCross = 225;
94 }
95 else if (sz <= 2) GCDCross = 450;
96 else if (sz <= 4) GCDCross = 600;
97 else if (sz < 12) GCDCross = 1150;
98 else GCDCross = 600;
99
100
101 #elif (defined(NTL_HAVE_PCLMUL))
102
103 //========== KarCross ==========
104
105 if (sz <= 1) {
106 if (deg(p) <= NTL_BITS_PER_LONG/2)
107 KarCross = 5;
20108 else
21109 KarCross = 8;
22110 }
23 else if (p.size == 2)
24 KarCross = 8;
25 else if (p.size <= 5)
26 KarCross = 4;
27 else if (p.size == 6)
28 KarCross = 3;
29 else
30 KarCross = 2;
31
32
33 if (p.size <= 1) {
34 if (deg(p) <= NTL_BITS_PER_LONG/2)
35 ModCross = 20;
36 else
37 ModCross = 40;
38 }
39 else if (p.size <= 2)
40 ModCross = 75;
41 else if (p.size <= 4)
42 ModCross = 50;
43 else
44 ModCross = 25;
45
46 if (p.size == 1) {
47 if (deg(p) <= NTL_BITS_PER_LONG/2)
48 DivCross = 100;
49 else
50 DivCross = 200;
51 }
52 else if (p.size == 2)
53 DivCross = 400;
54 else if (p.size <= 4)
55 DivCross = 200;
56 else if (p.size == 5)
57 DivCross = 150;
58 else if (p.size <= 13)
59 DivCross = 100;
60 else
61 DivCross = 75;
62
63 _card_exp = p.n;
111 else if (sz <= 5) KarCross = 8;
112 else if (sz <= 9) KarCross = 4;
113 else KarCross = 2;
114
115
116
117 //========== ModCross ==========
118
119
120 if (sz <= 1) {
121 if (deg(p) <= NTL_BITS_PER_LONG/2)
122 ModCross = 30;
123 else
124 ModCross = 45;
125 }
126 else if (sz <= 2) ModCross = 110;
127 else if (sz <= 3) ModCross = 105;
128 else if (sz <= 4) ModCross = 65;
129 else if (sz <= 5) ModCross = 60;
130 else if (sz <= 6) ModCross = 55;
131 else if (sz <= 8) ModCross = 50;
132 else if (sz <= 12) ModCross = 30;
133 else if (sz <= 18) ModCross = 25;
134 else ModCross = 15;
135
136
137
138 //========== DivCross ==========
139
140
141 if (sz <= 1) {
142 if (deg(p) <= NTL_BITS_PER_LONG/2)
143 DivCross = 75;
144 else
145 DivCross = 125;
146 }
147 else if (sz <= 2) DivCross = 450;
148 else if (sz <= 3) DivCross = 425;
149 else if (sz <= 4) DivCross = 375;
150 else if (sz <= 6) DivCross = 250;
151 else if (sz <= 8) DivCross = 225;
152 else if (sz <= 16) DivCross = 125;
153 else if (sz <= 45) DivCross = 100;
154 else DivCross = 75;
155
156
157 //========== GCDCross ==========
158
159 if (sz <= 1) {
160 if (deg(p) <= NTL_BITS_PER_LONG/2)
161 GCDCross = 225;
162 else
163 GCDCross = 225;
164 }
165 else if (sz < 12) GCDCross = 1150;
166 else GCDCross = 850;
167
168 #else
169
170 //========== KarCross ==========
171
172 if (sz <= 1) {
173 if (deg(p) <= NTL_BITS_PER_LONG/2)
174 KarCross = 4;
175 else
176 KarCross = 12;
177 }
178 else if (sz <= 3) KarCross = 4;
179 else KarCross = 2;
180
181
182
183 //========== ModCross ==========
184
185
186 if (sz <= 1) {
187 if (deg(p) <= NTL_BITS_PER_LONG/2)
188 ModCross = 45;
189 else
190 ModCross = 65;
191 }
192 else if (sz <= 2) ModCross = 25;
193 else ModCross = 15;
194
195
196 //========== DivCross ==========
197
198 if (sz <= 1) {
199 if (deg(p) <= NTL_BITS_PER_LONG/2)
200 DivCross = 175;
201 else
202 DivCross = 250;
203 }
204 else if (sz <= 4) DivCross = 100;
205 else DivCross = 75;
206
207 //========== GCDCross ==========
208
209 if (sz <= 1) {
210 if (deg(p) <= NTL_BITS_PER_LONG/2)
211 GCDCross = 225;
212 else
213 GCDCross = 850;
214 }
215 else if (sz < 8) GCDCross = 850;
216 else if (sz < 12) GCDCross = 600;
217 else GCDCross = 450;
218
219
220 #endif
221
64222 }
65223
66224
766766 return;
767767 }
768768
769 if (GF2E::WordLength() <= 1) {
769 bool use_kron_mul = false;
770
771 if (GF2E::WordLength() <= 1) use_kron_mul = true;
772
773 #if (defined(NTL_GF2X_LIB) && defined(NTL_HAVE_PCLMUL))
774 // With gf2x library and pclmul, KronMul is better in a larger range, but
775 // it is very hard to characterize that range. The following is very
776 // conservative.
777
778 if (GF2E::WordLength() <= 4 && sa >= 50 && sb >= 50) use_kron_mul = true;
779 // FIXME: figure out a larger range where KronMul is better
780 // (and don't forget to recompute crossovers in GF2E.cpp).
781 #endif
782
783
784 if (use_kron_mul) {
770785 KronMul(c, a, b);
771786 return;
772787 }
807822 }
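// ---------------------------------------------------------------------------
// Editorial note (not part of the upstream sources): roughly speaking, KronMul
// multiplies via Kronecker substitution -- the GF2E coefficients are packed
// into a single pair of GF2X polynomials with enough zero padding that the
// coefficients of the product can be read back off and reduced modulo
// GF2E::modulus().  It therefore wins exactly when one large GF2X multiply is
// cheap relative to many small GF2E multiplies, e.g. with the gf2x library
// and/or pclmul, which is what the use_kron_mul dispatch above reflects.
// ---------------------------------------------------------------------------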
808823
809824
825
826 #if 0
827 // used only for computing KarCross using GF2EXKarCross.cpp
828 void mul_disable_plain(GF2EX& c, const GF2EX& a, const GF2EX& b)
829 {
830 if (IsZero(a) || IsZero(b)) {
831 clear(c);
832 return;
833 }
834
835 if (&a == &b) {
836 sqr(c, a);
837 return;
838 }
839
840 long sa = a.rep.length();
841 long sb = b.rep.length();
842
843 if (sa == 1) {
844 mul(c, b, a.rep[0]);
845 return;
846 }
847
848 if (sb == 1) {
849 mul(c, a, b.rep[0]);
850 return;
851 }
852
853 if (0) {
854 //if (sa < GF2E::KarCross() || sb < GF2E::KarCross()) {
855 PlainMul(c, a, b);
856 return;
857 }
858
859 if (GF2E::WordLength() <= 1) {
860 KronMul(c, a, b);
861 return;
862 }
863
864
865 /* karatsuba */
866
867 long n, hn, sp;
868
869 n = max(sa, sb);
870 sp = 0;
871 do {
872 hn = (n+1) >> 1;
873 sp += (hn << 2) - 1;
874 n = hn;
875 } while (n > 1);
876
877 GF2XVec stk;
878 stk.SetSize(sp + 2*(sa+sb)-1, 2*GF2E::WordLength());
879
880 long i;
881
882 for (i = 0; i < sa; i++)
883 stk[i+sa+sb-1] = rep(a.rep[i]);
884
885 for (i = 0; i < sb; i++)
886 stk[i+2*sa+sb-1] = rep(b.rep[i]);
887
888 KarMul(&stk[0], &stk[sa+sb-1], sa, &stk[2*sa+sb-1], sb,
889 &stk[2*(sa+sb)-1]);
890
891 c.rep.SetLength(sa+sb-1);
892
893 for (i = 0; i < sa+sb-1; i++)
894 conv(c.rep[i], stk[i]);
895
896 c.normalize();
897 }
898 #endif
899
900
901
902
903
810904 void MulTrunc(GF2EX& x, const GF2EX& a, const GF2EX& b, long n)
811905 {
812906 GF2EX t;
11731267 }
11741268
11751269
1176 void GCD(GF2EX& x, const GF2EX& a, const GF2EX& b)
1270 void PlainGCD(GF2EX& x, const GF2EX& a, const GF2EX& b)
11771271 {
11781272 GF2E t;
11791273
12061300 mul(x, x, t);
12071301 }
12081302
1303 class _NTL_GF2EXMatrix {
1304 private:
1305
1306 _NTL_GF2EXMatrix(const _NTL_GF2EXMatrix&); // disable
1307 GF2EX elts[2][2];
1308
1309 public:
1310
1311 _NTL_GF2EXMatrix() { }
1312 ~_NTL_GF2EXMatrix() { }
1313
1314 void operator=(const _NTL_GF2EXMatrix&);
1315 GF2EX& operator() (long i, long j) { return elts[i][j]; }
1316 const GF2EX& operator() (long i, long j) const { return elts[i][j]; }
1317 };
1318
1319
1320 void _NTL_GF2EXMatrix::operator=(const _NTL_GF2EXMatrix& M)
1321 {
1322 elts[0][0] = M.elts[0][0];
1323 elts[0][1] = M.elts[0][1];
1324 elts[1][0] = M.elts[1][0];
1325 elts[1][1] = M.elts[1][1];
1326 }
1327
1328
1329 static
1330 void mul(GF2EX& U, GF2EX& V, const _NTL_GF2EXMatrix& M)
1331 // (U, V)^T = M*(U, V)^T
1332 {
1333 GF2EX t1, t2, t3;
1334
1335 mul(t1, M(0,0), U);
1336 mul(t2, M(0,1), V);
1337 add(t3, t1, t2);
1338 mul(t1, M(1,0), U);
1339 mul(t2, M(1,1), V);
1340 add(V, t1, t2);
1341 U = t3;
1342 }
1343
1344
1345 static
1346 void mul(_NTL_GF2EXMatrix& A, _NTL_GF2EXMatrix& B, _NTL_GF2EXMatrix& C)
1347 // A = B*C, B and C are destroyed
1348 {
1349 GF2EX t1, t2;
1350
1351 mul(t1, B(0,0), C(0,0));
1352 mul(t2, B(0,1), C(1,0));
1353 add(A(0,0), t1, t2);
1354
1355 mul(t1, B(1,0), C(0,0));
1356 mul(t2, B(1,1), C(1,0));
1357 add(A(1,0), t1, t2);
1358
1359 mul(t1, B(0,0), C(0,1));
1360 mul(t2, B(0,1), C(1,1));
1361 add(A(0,1), t1, t2);
1362
1363 mul(t1, B(1,0), C(0,1));
1364 mul(t2, B(1,1), C(1,1));
1365 add(A(1,1), t1, t2);
1366
1367 long i, j;
1368 for (i = 0; i < 2; i++) {
1369 for (j = 0; j < 2; j++) {
1370 B(i,j).kill();
1371 C(i,j).kill();
1372 }
1373 }
1374 }
1375
1376
1377 void IterHalfGCD(_NTL_GF2EXMatrix& M_out, GF2EX& U, GF2EX& V, long d_red)
1378 {
1379 M_out(0,0).SetMaxLength(d_red);
1380 M_out(0,1).SetMaxLength(d_red);
1381 M_out(1,0).SetMaxLength(d_red);
1382 M_out(1,1).SetMaxLength(d_red);
1383
1384 set(M_out(0,0)); clear(M_out(0,1));
1385 clear(M_out(1,0)); set(M_out(1,1));
1386
1387 long goal = deg(U) - d_red;
1388
1389 if (deg(V) <= goal)
1390 return;
1391
1392 GF2EX Q, t(INIT_SIZE, d_red);
1393
1394 while (deg(V) > goal) {
1395 PlainDivRem(Q, U, U, V);
1396 swap(U, V);
1397
1398 mul(t, Q, M_out(1,0));
1399 sub(t, M_out(0,0), t);
1400 M_out(0,0) = M_out(1,0);
1401 M_out(1,0) = t;
1402
1403 mul(t, Q, M_out(1,1));
1404 sub(t, M_out(0,1), t);
1405 M_out(0,1) = M_out(1,1);
1406 M_out(1,1) = t;
1407 }
1408 }
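// ---------------------------------------------------------------------------
// Editorial note (not part of the upstream sources): IterHalfGCD above and the
// recursive HalfGCD below build a 2x2 matrix M_out out of the quotients of the
// Euclidean remainder sequence of (U, V).  The intended contract -- which the
// iterative version implements directly -- is that applying M_out to the
// original pair (via mul(U, V, M_out)) yields two consecutive remainders with
// deg(V) <= deg(U_original) - d_red.  The recursion below reaches the same
// state with two half-size subproblems plus one division, which is what makes
// the HalfGCD-based GCD and XGCD further down asymptotically fast.
// ---------------------------------------------------------------------------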
1409
1410
1411 #define NTL_GF2EX_HalfGCD_CROSSOVER (40)
1412
1413
1414 void HalfGCD(_NTL_GF2EXMatrix& M_out, const GF2EX& U, const GF2EX& V, long d_red)
1415 {
1416 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
1417 set(M_out(0,0)); clear(M_out(0,1));
1418 clear(M_out(1,0)); set(M_out(1,1));
1419
1420 return;
1421 }
1422
1423
1424 long n = deg(U) - 2*d_red + 2;
1425 if (n < 0) n = 0;
1426
1427 GF2EX U1, V1;
1428
1429 RightShift(U1, U, n);
1430 RightShift(V1, V, n);
1431
1432 if (d_red <= NTL_GF2EX_HalfGCD_CROSSOVER) {
1433 IterHalfGCD(M_out, U1, V1, d_red);
1434 return;
1435 }
1436
1437 long d1 = (d_red + 1)/2;
1438 if (d1 < 1) d1 = 1;
1439 if (d1 >= d_red) d1 = d_red - 1;
1440
1441 _NTL_GF2EXMatrix M1;
1442
1443 HalfGCD(M1, U1, V1, d1);
1444 mul(U1, V1, M1);
1445
1446 long d2 = deg(V1) - deg(U) + n + d_red;
1447
1448 if (IsZero(V1) || d2 <= 0) {
1449 M_out = M1;
1450 return;
1451 }
1452
1453
1454 GF2EX Q;
1455 _NTL_GF2EXMatrix M2;
1456
1457 DivRem(Q, U1, U1, V1);
1458 swap(U1, V1);
1459
1460 HalfGCD(M2, U1, V1, d2);
1461
1462 GF2EX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
1463
1464 mul(t, Q, M1(1,0));
1465 sub(t, M1(0,0), t);
1466 swap(M1(0,0), M1(1,0));
1467 swap(M1(1,0), t);
1468
1469 t.kill();
1470
1471 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
1472
1473 mul(t, Q, M1(1,1));
1474 sub(t, M1(0,1), t);
1475 swap(M1(0,1), M1(1,1));
1476 swap(M1(1,1), t);
1477
1478 t.kill();
1479
1480 mul(M_out, M2, M1);
1481 }
1482
1483 void XHalfGCD(_NTL_GF2EXMatrix& M_out, GF2EX& U, GF2EX& V, long d_red)
1484 {
1485 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
1486 set(M_out(0,0)); clear(M_out(0,1));
1487 clear(M_out(1,0)); set(M_out(1,1));
1488
1489 return;
1490 }
1491
1492 long du = deg(U);
1493
1494 if (d_red <= NTL_GF2EX_HalfGCD_CROSSOVER) {
1495 IterHalfGCD(M_out, U, V, d_red);
1496 return;
1497 }
1498
1499 long d1 = (d_red + 1)/2;
1500 if (d1 < 1) d1 = 1;
1501 if (d1 >= d_red) d1 = d_red - 1;
1502
1503 //ZZ_pXMatrix M1;
1504 _NTL_GF2EXMatrix M1;
1505
1506 HalfGCD(M1, U, V, d1);
1507 mul(U, V, M1);
1508
1509 long d2 = deg(V) - du + d_red;
1510
1511 if (IsZero(V) || d2 <= 0) {
1512 M_out = M1;
1513 return;
1514 }
1515
1516
1517 GF2EX Q;
1518 _NTL_GF2EXMatrix M2;
1519
1520 DivRem(Q, U, U, V);
1521 swap(U, V);
1522
1523 XHalfGCD(M2, U, V, d2);
1524
1525 GF2EX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
1526
1527 mul(t, Q, M1(1,0));
1528 sub(t, M1(0,0), t);
1529 swap(M1(0,0), M1(1,0));
1530 swap(M1(1,0), t);
1531
1532 t.kill();
1533
1534 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
1535
1536 mul(t, Q, M1(1,1));
1537 sub(t, M1(0,1), t);
1538 swap(M1(0,1), M1(1,1));
1539 swap(M1(1,1), t);
1540
1541 t.kill();
1542
1543 mul(M_out, M2, M1);
1544 }
1545
1546 void HalfGCD(GF2EX& U, GF2EX& V)
1547 {
1548 long d_red = (deg(U)+1)/2;
1549
1550 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
1551 return;
1552 }
1553
1554 long du = deg(U);
1555
1556
1557 long d1 = (d_red + 1)/2;
1558 if (d1 < 1) d1 = 1;
1559 if (d1 >= d_red) d1 = d_red - 1;
1560
1561 _NTL_GF2EXMatrix M1;
1562
1563 HalfGCD(M1, U, V, d1);
1564 mul(U, V, M1);
1565
1566 long d2 = deg(V) - du + d_red;
1567
1568 if (IsZero(V) || d2 <= 0) {
1569 return;
1570 }
1571
1572 M1(0,0).kill();
1573 M1(0,1).kill();
1574 M1(1,0).kill();
1575 M1(1,1).kill();
1576
1577
1578 GF2EX Q;
1579
1580 DivRem(Q, U, U, V);
1581 swap(U, V);
1582
1583 HalfGCD(M1, U, V, d2);
1584
1585 mul(U, V, M1);
1586 }
1587
1588
1589 void GCD(GF2EX& d, const GF2EX& u, const GF2EX& v)
1590 {
1591 GF2EX u1, v1;
1592
1593 u1 = u;
1594 v1 = v;
1595
1596 if (deg(u1) == deg(v1)) {
1597 if (IsZero(u1)) {
1598 clear(d);
1599 return;
1600 }
1601
1602 rem(v1, v1, u1);
1603 }
1604 else if (deg(u1) < deg(v1)) {
1605 swap(u1, v1);
1606 }
1607
1608 // deg(u1) > deg(v1)
1609
1610 while (deg(u1) >= GF2E::GCDCross() && !IsZero(v1)) {
1611 HalfGCD(u1, v1);
1612
1613 if (!IsZero(v1)) {
1614 rem(u1, u1, v1);
1615 swap(u1, v1);
1616 }
1617 }
1618
1619 PlainGCD(d, u1, v1);
1620 }
12091621
12101622
12111623
12121624
1625
12131626 void XGCD(GF2EX& d, GF2EX& s, GF2EX& t, const GF2EX& a, const GF2EX& b)
12141627 {
1215 GF2E z;
1216
1217
1218 if (IsZero(b)) {
1628 GF2E w;
1629
1630 if (IsZero(a) && IsZero(b)) {
1631 clear(d);
12191632 set(s);
12201633 clear(t);
1221 d = a;
1222 }
1223 else if (IsZero(a)) {
1224 clear(s);
1225 set(t);
1226 d = b;
1227 }
1228 else {
1229 long e = max(deg(a), deg(b)) + 1;
1230
1231 GF2EX temp(INIT_SIZE, e), u(INIT_SIZE, e), v(INIT_SIZE, e),
1232 u0(INIT_SIZE, e), v0(INIT_SIZE, e),
1233 u1(INIT_SIZE, e), v1(INIT_SIZE, e),
1234 u2(INIT_SIZE, e), v2(INIT_SIZE, e), q(INIT_SIZE, e);
1235
1236
1237 set(u1); clear(v1);
1238 clear(u2); set(v2);
1239 u = a; v = b;
1240
1241 do {
1242 DivRem(q, u, u, v);
1243 swap(u, v);
1244 u0 = u2;
1245 v0 = v2;
1246 mul(temp, q, u2);
1247 add(u2, u1, temp);
1248 mul(temp, q, v2);
1249 add(v2, v1, temp);
1250 u1 = u0;
1251 v1 = v0;
1252 } while (!IsZero(v));
1253
1254 d = u;
1255 s = u1;
1256 t = v1;
1257 }
1258
1259 if (IsZero(d)) return;
1260 if (IsOne(LeadCoeff(d))) return;
1261
1262 /* make gcd monic */
1263
1264 inv(z, LeadCoeff(d));
1265 mul(d, d, z);
1266 mul(s, s, z);
1267 mul(t, t, z);
1634 return;
1635 }
1636
1637 GF2EX U, V, Q;
1638
1639 U = a;
1640 V = b;
1641
1642 long flag = 0;
1643
1644 if (deg(U) == deg(V)) {
1645 DivRem(Q, U, U, V);
1646 swap(U, V);
1647 flag = 1;
1648 }
1649 else if (deg(U) < deg(V)) {
1650 swap(U, V);
1651 flag = 2;
1652 }
1653
1654 _NTL_GF2EXMatrix M;
1655
1656 XHalfGCD(M, U, V, deg(U)+1);
1657
1658 d = U;
1659
1660 if (flag == 0) {
1661 s = M(0,0);
1662 t = M(0,1);
1663 }
1664 else if (flag == 1) {
1665 s = M(0,1);
1666 mul(t, Q, M(0,1));
1667 sub(t, M(0,0), t);
1668 }
1669 else { /* flag == 2 */
1670 s = M(0,1);
1671 t = M(0,0);
1672 }
1673
1674 // normalize
1675
1676 inv(w, LeadCoeff(d));
1677 mul(d, d, w);
1678 mul(s, s, w);
1679 mul(t, t, w);
12681680 }
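// ---------------------------------------------------------------------------
// Editorial note (not part of the upstream sources): on return (and when the
// inputs are not both zero), d is the monic gcd and s, t satisfy the Bezout
// relation d = s*a + t*b.  Under mild conditions on the input degrees, the
// usual bounds deg(s) < deg(b) and deg(t) < deg(a) are expected to hold;
// GF2EXGCDTest.cpp, added later in this changeset, checks the Bezout relation,
// these degree bounds, and agreement between GCD and XGCD.
// ---------------------------------------------------------------------------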
12691681
12701682
15581970 }
15591971 }
15601972
1973
1974 #if 0
1975 // used only for computing ModCross using GF2EXModCross.cpp
1976 void BuildPlain(GF2EXModulus& F, const GF2EX& f, bool plain)
1977 {
1978 long n = deg(f);
1979
1980 if (n <= 0) LogicError("build(GF2EXModulus,GF2EX): deg(f) <= 0");
1981
1982 if (NTL_OVERFLOW(n, GF2E::degree(), 0))
1983 ResourceError("build(GF2EXModulus,GF2EX): overflow");
1984
1985 F.tracevec.make();
1986
1987 F.f = f;
1988 F.n = n;
1989
1990 if (plain) {
1991 F.method = GF2EX_MOD_PLAIN;
1992 }
1993 else {
1994 F.method = GF2EX_MOD_MUL;
1995 GF2EX P1;
1996 GF2EX P2;
1997
1998 CopyReverse(P1, f, n);
1999 InvTrunc(P2, P1, n-1);
2000 CopyReverse(P1, P2, n-2);
2001 trunc(F.h0, P1, n-2);
2002 trunc(F.f0, f, n);
2003 F.hlc = ConstTerm(P2);
2004 }
2005 }
2006 #endif
2007
15612008 GF2EXModulus::GF2EXModulus()
15622009 {
15632010 n = -1;
20562503 DivRem(q, r, a, B);
20572504 }
20582505 }
2506
2507 #if 0
2508 // used only for computing DivCross using GF2EXDivCross.cpp
2509 void DivRemPlain(GF2EX& q, GF2EX& r, const GF2EX& a, const GF2EX& b, bool plain)
2510 {
2511 long sa = a.rep.length();
2512 long sb = b.rep.length();
2513
2514 if (plain)
2515 PlainDivRem(q, r, a, b);
2516 else if (sa < 4*sb)
2517 UseMulDivRem(q, r, a, b);
2518 else {
2519 GF2EXModulus B;
2520 build(B, b);
2521 DivRem(q, r, a, B);
2522 }
2523 }
2524 #endif
20592525
20602526 void div(GF2EX& q, const GF2EX& a, const GF2EX& b)
20612527 {
0 #include <NTL/GF2EX.h>
1 #include <NTL/GF2XFactoring.h>
2
3 namespace NTL {
4
5 void DivRemPlain(GF2EX& q, GF2EX& r, const GF2EX& a, const GF2EX& b, bool plain);
6
7 }
8
9 NTL_CLIENT
10
11
12 #define TIME_IT(t, action) \
13 do { \
14 double _t0, _t1; \
15 long _iter = 1; \
16 long _cnt = 0; \
17 do { \
18 _t0 = GetTime(); \
19 for (long _i = 0; _i < _iter; _i++) { action; _cnt++; } \
20 _t1 = GetTime(); \
21 } while ( _t1 - _t0 < 2 && (_iter *= 2)); \
22 t = (_t1 - _t0)/_iter; \
23 } while(0)
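// Editorial note (not part of the upstream sources): TIME_IT(t, action) runs
// `action` in a loop, doubling the iteration count until one pass takes at
// least 2 seconds as measured by GetTime(), then stores the resulting
// per-iteration time in t.  The same macro is repeated in the other
// crossover-tuning programs below.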
24
25
26 long test(long k)
27 {
28 GF2X P;
29
30 BuildIrred(P, k);
31 GF2EPush push(P);
32
33 for (long n = 25; ; n+=25) {
34 cerr << ",";
35 GF2EX a, b, q, r;
36 random(a, 2*n);
37 random(b, n);
38 double t1, t2;
39 TIME_IT(t1, DivRemPlain(q, r, a, b, false));
40 TIME_IT(t2, DivRemPlain(q, r, a, b, true));
41 double t = t1/t2;
42 if (t <= 0.95) return n;
43 }
44 }
45
46 int main()
47 {
48 cerr << "0.5 " << test(32) << "\n";
49 for (long i = 1; i <= 50; i++) {
50 cerr << i << " " << test(64*i) << "\n";
51 }
52
53 for (long i = 75; i <= 200 ; i+=25) {
54 cerr << i << " " << test(64*i) << "\n";
55 }
56 }
57
58
0 #include <NTL/GF2EX.h>
1 #include <NTL/GF2XFactoring.h>
2
3 namespace NTL {
4
5 void HalfGCD(GF2EX&,GF2EX&);
6 void PlainRem(GF2EX& r, const GF2EX& a, const GF2EX& b, GF2XVec& x);
7
8 }
9
10 NTL_CLIENT
11
12
13 #define TIME_IT(t, action) \
14 do { \
15 double _t0, _t1; \
16 long _iter = 1; \
17 long _cnt = 0; \
18 do { \
19 _t0 = GetTime(); \
20 for (long _i = 0; _i < _iter; _i++) { action; _cnt++; } \
21 _t1 = GetTime(); \
22 } while ( _t1 - _t0 < 2 && (_iter *= 2)); \
23 t = (_t1 - _t0)/_iter; \
24 } while(0)
25
26
27
28 void TestGCD1(long bnd, const GF2EX& a, const GF2EX& b)
29 {
30
31 long n = deg(a) + 1;
32 GF2EX u(INIT_SIZE, n), v(INIT_SIZE, n);
33 GF2XVec tmp(n, 2*GF2E::WordLength());
34
35 u = a;
36 v = b;
37 while (deg(v) > bnd) {
38 PlainRem(u, u, v, tmp);
39 swap(u, v);
40 }
41
42 }
43
44
45 long test(long k)
46 {
47 GF2X P;
48
49 BuildIrred(P, k);
50 GF2EPush push(P);
51
52 for (long n = 42; ; n = long(n*1.4)) {
53 cerr << ",";
54 GF2EX d, a, b, u, v;
55 random(a, n);
56 SetCoeff(a, n);
57 random(b, n);
58 double t1, t2;
59 TIME_IT(t1, u=a; v=b; HalfGCD(u, v));
60 TIME_IT(t2, TestGCD1(deg(v), a, b));
61 double t = t1/t2;
62 if (t <= 1) return n;
63 }
64 }
65
66 int main()
67 {
68 #if 1
69 cerr << "0.5 " << test(32) << "\n";
70 for (long i = 1; i <= 4; i+=1) {
71 cerr << i << " " << test(64*i) << "\n";
72 }
73 for (long i = 8; i <= 16; i+=4) {
74 cerr << i << " " << test(64*i) << "\n";
75 }
76 #endif
77 for (long i = 24; i <= 48; i+=8) {
78 cerr << i << " " << test(64*i) << "\n";
79 }
80 }
81
82
0 #include <NTL/GF2XFactoring.h>
1 #include <NTL/GF2EX.h>
2
3 NTL_CLIENT
4
5
6
7 void test(GF2X& P, GF2EX& f, GF2EX& g, GF2EX& h, GF2EX& hx, GF2EX& s, GF2EX& t)
8 {
9 /* P is the polynomial of the extension
10 * f and g the polynomials
11 * h the gcd
12 * hx the gcd obtained using XGCD
13 * s, t are Bezout coefficients hx=f*s+g*t
14 */
15 GF2EX htest,rf,rg;
16
17 if (h!=hx){
18 cout << P << "\n" << f << "\n" << g << "\n";
19 Error("different gcd:\n");
20 }
21
22 if (max(deg(f), deg(g)) > 0 || min(deg(f), deg(g)) >= 0) {
23 if (deg(s) >= deg(g) || deg(t) >= deg(f)) {
24 cout << P << "\n" << f << "\n" << g << "\n";
25 Error("degree bounds at fault:\n");
26 }
27 }
28
29
30 mul(s,s,f);
31 mul(t,t,g);
32 add(htest,t,s);
33 if (h!=htest){
34 cout << P << "\n" << f << "\n" << g << "\n";
35 Error("xgcd at fault:\n");
36 }
37 if (!IsZero(h)){
38 rem(rf,f,h);
39 rem(rg,g,h);
40 if ((!IsZero(rf))||(!IsZero(rg))){
41 cout << P << "\n" << f << "\n" << g << "\n";
42 Error("not a common divisor\n");
43 }
44 }else{
45 if (!IsZero(f) && !IsZero(g)){
46 cout << "debug:\n";
47 cout << P << "\n" << f << "\n" << g << "\n" << h << "\n";
48 Error("ooops:\n");
49 }
50 }
51 }
52
53
54 int main()
55 {
56
57 GF2X P;
58
59 BuildIrred(P, 128);
60
61 GF2E::init(P);
62
63 for (long i = 0; i < 400; i++) {
64 if (i%10 == 0) cerr << ".";
65 GF2EX f,g,h,s,t,hx;
66
67 long deg_h;
68 if (RandomBnd(2))
69 deg_h = RandomBnd(10)+1;
70 else
71 deg_h = RandomBnd(500)+1;
72
73 random(h, deg_h);
74 SetCoeff(h, deg_h);
75
76 long deg_f;
77 if (RandomBnd(2))
78 deg_f = RandomBnd(10)+1;
79 else
80 deg_f = RandomBnd(1000)+1;
81
82 random(f, deg_f);
83 f *= h;
84
85 long deg_g;
86 if (RandomBnd(2))
87 deg_g = RandomBnd(10)+1;
88 else
89 deg_g = RandomBnd(1000)+1;
90
91 random(g, deg_g);
92 g *= h;
93
94 h = 0;
95
96 GCD(h, f, g);
97 XGCD(hx, s, t, f, g);
98 test(P, f, g, h, hx, s, t);
99 }
100
101 cerr << "\n";
102
103 }
0 #include <NTL/GF2EX.h>
1 #include <NTL/GF2XFactoring.h>
2
3 namespace NTL {
4
5 void PlainMul(GF2EX&,const GF2EX&,const GF2EX&);
6 void mul_disable_plain(GF2EX&,const GF2EX&,const GF2EX&);
7
8 }
9
10 NTL_CLIENT
11
12
13 #define TIME_IT(t, action) \
14 do { \
15 double _t0, _t1; \
16 long _iter = 1; \
17 long _cnt = 0; \
18 do { \
19 _t0 = GetTime(); \
20 for (long _i = 0; _i < _iter; _i++) { action; _cnt++; } \
21 _t1 = GetTime(); \
22 } while ( _t1 - _t0 < 2 && (_iter *= 2)); \
23 t = (_t1 - _t0)/_iter; \
24 } while(0)
25
26
27 long test(long k)
28 {
29 GF2X P;
30
31 BuildIrred(P, k);
32 GF2EPush push(P);
33
34 for (long n = 2; ; n++) {
35 cerr << ",";
36 GF2EX a, b, c;
37 random(a, n);
38 random(b, n);
39 double t1, t2;
40 TIME_IT(t1, mul_disable_plain(c, a, b));
41 TIME_IT(t2, PlainMul(c, a, b));
42 double t = t1/t2;
43 if (t <= 0.95) return n;
44 }
45 }
46
47 int main()
48 {
49 cerr << "0.5 " << test(32) << "\n";
50 for (long i = 1; i <= 40; i++) {
51 cerr << i << " " << test(64*i) << "\n";
52 }
53 }
54
55
0 #include <NTL/GF2EX.h>
1 #include <NTL/GF2XFactoring.h>
2
3 namespace NTL {
4
5 void BuildPlain(GF2EXModulus& F, const GF2EX& f, bool plain);
6
7 }
8
9 NTL_CLIENT
10
11
12 #define TIME_IT(t, action) \
13 do { \
14 double _t0, _t1; \
15 long _iter = 1; \
16 long _cnt = 0; \
17 do { \
18 _t0 = GetTime(); \
19 for (long _i = 0; _i < _iter; _i++) { action; _cnt++; } \
20 _t1 = GetTime(); \
21 } while ( _t1 - _t0 < 2 && (_iter *= 2)); \
22 t = (_t1 - _t0)/_iter; \
23 } while(0)
24
25
26 long test(long k)
27 {
28 GF2X P;
29
30 BuildIrred(P, k);
31 GF2EPush push(P);
32
33 for (long n = 5; ; n+=5) {
34 cerr << ",";
35 GF2EX a, r, f;
36 random(a, 2*n-1);
37 random(f, n);
38 SetCoeff(f, n);
39 GF2EXModulus F1, F2;
40 BuildPlain(F1, f, false);
41 BuildPlain(F2, f, true);
42 double t1, t2;
43 TIME_IT(t1, rem(r, a, F1));
44 TIME_IT(t2, rem(r, a, F2));
45 double t = t1/t2;
46 if (t <= 0.95) return n;
47 }
48 }
49
50 int main()
51 {
52 cerr << "0.5 " << test(32) << "\n";
53 for (long i = 1; i <= 40 ; i++) {
54 cerr << i << " " << test(64*i) << "\n";
55 }
56 }
57
58
165165 cout << "NTL_SAFE_VECTORS=0\n";
166166 #endif
167167
168 #ifdef NTL_ENABLE_AVX_FFT
169 cout << "NTL_ENABLE_AVX_FFT=1\n";
170 #else
171 cout << "NTL_ENABLE_AVX_FFT=0\n";
172 #endif
173
174 #ifdef NTL_AVOID_AVX512
175 cout << "NTL_AVOID_AVX512=1\n";
176 #else
177 cout << "NTL_AVOID_AVX512=0\n";
178 #endif
179
168180 #ifdef NTL_RANGE_CHECK
169181 cout << "NTL_RANGE_CHECK=1\n";
170182 #else
172184 #endif
173185
174186
187
175188 // the following are not actual config flags, but help
176189 // in the Wizard logic
177190
0
01
12 -----------------------------
23 These are basically notes to myself on preparing a new
2728
2829 =====================================
2930
31 TODO: add a runtime flag that makes GetTime call GetWallTime
32
3033 FIXME: maybe it would make more sense to take the +1/-1 logic
3134 out of [cg]_lip_impl block_construct routines and just put it in
3235 the caller: the ZZ_p and ZZVec BlockConstruct stuff: add 1 there...
88
99 quad_float::SetOutputPrecision(25);
1010
11 if (PrecisionOK())
11 long pok;
12 double one = 1.0;
13 quad_float_PrecisionOK(pok, one);
14 if (pok)
1215 cout << "Precision OK\n";
1316 else
1417 cout << "Precision not OK\n";
256256 cerr << "NTL_SAFE_VECTORS\n";
257257 #endif
258258
259 #ifdef NTL_ENABLE_AVX_FFT
260 cerr << "NTL_ENABLE_AVX_FFT\n";
261 #endif
262
263 #ifdef NTL_AVOID_AVX512
264 cerr << "NTL_AVOID_AVX512\n";
265 #endif
266
259267 #ifdef NTL_RANGE_CHECK
260268 cerr << "NTL_RANGE_CHECK\n";
261269 #endif
0 #include <NTL/ZZX.h>
1
2 NTL_CLIENT
3
4
5 #define TIME_IT(t, action) \
6 do { \
7 double _t0, _t1; \
8 long _iter = 1; \
9 long _cnt = 0; \
10 do { \
11 _t0 = GetTime(); \
12 for (long _i = 0; _i < _iter; _i++) { action; _cnt++; } \
13 _t1 = GetTime(); \
14 } while ( _t1 - _t0 < 2 && (_iter *= 2)); \
15 t = (_t1 - _t0)/_iter; \
16 } while(0)
17
18 void FillRandom(ZZX& f, long n, long k)
19 {
20 long sw = RandomBnd(2);
21 f.SetLength(n);
22 for (long i = 0; i < n; i++) {
23 if (sw) {
24 long kk = 1 + RandomBnd(k);
25 RandomBits(f[i], kk);
26 }
27 else {
28 long kk = RandomBnd(k);
29 SetBit(f[i], kk);
30 }
31 if (RandomBnd(2)) NTL::negate(f[i], f[i]);
32 }
33 f.normalize();
34 }
35
36 int main()
37 {
38
39 for (long iter = 0; iter < 4000; iter++) {
40 if (iter % 100 == 0) cerr << ".";
41 long na, nb, k;
42
43 long sw = RandomBnd(3);
44
45 if (sw == 0) {
46 na = RandomBnd(20) + 1;
47 nb = RandomBnd(20) + 1;
48 k = RandomBnd(20) + 1;
49 }
50 else if (sw == 1) {
51 na = RandomBnd(200) + 10;
52 nb = RandomBnd(200) + 10;
53 k = RandomBnd(200) + 10;
54 }
55 else {
56 na = RandomBnd(3000) + 100;
57 nb = RandomBnd(3000) + 100;
58 k = RandomBnd(3000) + 100;
59 }
60
61 ZZX a, b, c, c1;
62 FillRandom(a, na, k);
63 FillRandom(b, nb, k);
64
65 if (RandomBnd(2)) {
66 SSMul(c, a, b);
67 KarMul(c1, a, b);
68 if (c != c1) Error("oops");
69 }
70 else {
71 SSSqr(c, a);
72 KarSqr(c1, a);
73 if (c != c1) Error("oops");
74 }
75 }
76
77 cerr << "\n";
78 }
79
80
66 echo "running ZZTest"
77 ./ZZTest
88 sh RemoveProg ZZTest
9
10 echo
11 echo "---------------------------------"
12 echo "making SSMulTest"
13 $MAKE_PROG SSMulTest
14 echo "running SSMulTest"
15 ./SSMulTest
16 sh RemoveProg SSMulTest
17
18
19 echo
20 echo "---------------------------------"
21 echo "making ZZ_pXTest"
22 $MAKE_PROG ZZ_pXTest
23 echo "running ZZ_pXTest"
24 ./ZZ_pXTest
25 sh RemoveProg ZZ_pXTest
26
27 echo
28 echo "---------------------------------"
29 echo "making lzz_pXTest"
30 $MAKE_PROG lzz_pXTest
31 echo "running lzz_pXTest"
32 ./lzz_pXTest
33 sh RemoveProg lzz_pXTest
934
1035 echo
1136 echo "---------------------------------"
7499 ./GF2EXTest
75100 sh RemoveProg GF2EXTest
76101
102
103 echo
104 echo "---------------------------------"
105 echo "making GF2EXGCDTest"
106 $MAKE_PROG GF2EXGCDTest
107 echo "running GF2EXGCDTest"
108 ./GF2EXGCDTest
109 sh RemoveProg GF2EXGCDTest
77110
78111 echo
79112 echo "---------------------------------"
188221 ./ZZ_pEXTest
189222 sh RemoveProg ZZ_pEXTest
190223
224 echo
225 echo "---------------------------------"
226 echo "making ZZ_pEXGCDTest"
227 $MAKE_PROG ZZ_pEXGCDTest
228 echo "running ZZ_pEXGCDTest"
229 ./ZZ_pEXGCDTest
230 sh RemoveProg ZZ_pEXGCDTest
231
232
191233
192234 echo
193235 echo "---------------------------------"
200242
201243 echo
202244 echo "---------------------------------"
245 echo "making lzz_pEXGCDTest"
246 $MAKE_PROG lzz_pEXGCDTest
247 echo "running lzz_pEXGCDTest"
248 ./lzz_pEXGCDTest
249 sh RemoveProg lzz_pEXGCDTest
250
251
252 echo
253 echo "---------------------------------"
203254 echo "making ThreadTest"
204255 $MAKE_PROG ThreadTest
205256 echo "running ThreadTest"
0 36:0:0
0 39:0:0
0 WinNTL-11_0_0
0 WinNTL-11_3_0
5959
6060 sh CopyFeatures '..' small "$3"
6161 cp ../include/NTL/FFT.h small/include/NTL
62 cp ../include/NTL/FFT_impl.h small/include/NTL
6263 cp ../include/NTL/ctools.h small/include/NTL
6364 cp ../include/NTL/ZZ.h small/include/NTL
6465 cp ../include/NTL/sp_arith.h small/include/NTL
148148 next;
149149 }
150150
151 print "run: $aflag1 $bflag1 $cflag1 NTL_FFT_BIGTAB\n";
151 print "run: $aflag1 $bflag1 $cflag1\n";
152152 GenConfigHeader();
153153 $time1 = RunProg("Poly1TimeTest");
154154
173173
174174 # now see if BIGTAB really helps
175175
176
177176 $Config{"NTL_FFT_BIGTAB"} = 0;
178177 print "run: $aflag $bflag $cflag default\n";
179178 GenConfigHeader();
5454
5555 if (frozen) LogicError("Cannot grow this WordVector");
5656
57 m = max(n, long(NTL_WordVectorExpansionRatio*max_length));
57 m = max(n, _ntl_vec_grow(max_length));
5858
5959 m = ((m+NTL_WordVectorMinAlloc-1)/NTL_WordVectorMinAlloc)*NTL_WordVectorMinAlloc;
6060 _ntl_ulong *p = rep - 2;
14451445
14461446
14471447
1448 void sub(ZZ& x, const ZZ& a, long b)
1449 {
1450 NTL_ZZRegister(B);
1451 conv(B, b);
1452 sub(x, a, B);
1453 }
1454
14551448 void sub(ZZ& x, long a, const ZZ& b)
14561449 {
14571450 NTL_ZZRegister(A);
19361929 }
19371930
19381931
1939 void old_RandomStream::do_get(unsigned char *NTL_RESTRICT res, long n)
1932 void old_RandomStream::do_get(unsigned char *res, long n)
19401933 {
19411934 if (n < 0) LogicError("RandomStream::get: bad args");
19421935
5959 CHECK(q1.validate() && r1.validate() && q == q1 && r == r1);
6060 }
6161
62 cerr << "\nvalidating mul...";
63 for (long i = 0; i < 1000000; i++) {
64 long a_len = RandomBnd(1000)+1;
65 long b_len = RandomBnd(1000)+1;
66
67 ZZ a, b, c;
68
69 RandomLen(a, a_len);
70 RandomLen(b, b_len);
71
72 if (RandomBnd(2)) a = -a;
73 if (RandomBnd(2)) b = -b;
74
75 long p = 7919;
76 long r = MulMod(rem(a, p), rem(b, p), p);
77 long s = MulMod(rem(a, p), rem(a, p), p);
78
79 switch (RandomBnd(5)) {
80 case 0:
81 mul(c, a, b);
82 CHECK(c.validate() && rem(c, p) == r);
83 break;
84
85 case 1:
86 mul(a, a, b);
87 CHECK(a.validate() && rem(a, p) == r);
88 break;
89
90 case 2:
91 mul(b, a, b);
92 CHECK(b.validate() && rem(b, p) == r);
93 break;
94
95 case 3:
96 mul(c, a, a);
97 CHECK(c.validate() && rem(c, p) == s);
98 break;
99
100 case 4:
101 mul(a, a, a);
102 CHECK(a.validate() && rem(a, p) == s);
103 break;
104 }
105 }
106
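[editor's note] The mul check above is probabilistic rather than exhaustive: each product is verified only modulo the fixed prime 7919, so an incorrect result would slip through a given trial only if the error happened to be divisible by 7919, roughly a 1-in-7919 chance per iteration.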
62107 cerr << "\nvalidating squaring...";
63 for (long i = 0; i < 200000; i++) {
64 long a_len = RandomBnd(8000)+5;
65
66 ZZ a, b, a1, c;
67 RandomLen(a, a_len);
68
69 sqr(b, a);
108 for (long i = 0; i < 1000000; i++) {
109 long a_len = RandomBnd(1000)+1;
110
111 ZZ a, b, a1, a2, c;
112 RandomLen(a, a_len);
113
114 if (RandomBnd(2)) a = -a;
115
70116 a1 = a;
71 mul(c, a, a1);
72
73 CHECK(b.validate() && c.validate() && b == c);
117 a2 = a;
118
119 if (RandomBnd(2)) {
120 sqr(b, a);
121 mul(c, a1, a2);
122 CHECK(b.validate() && c.validate() && b == c);
123 }
124 else {
125 sqr(a, a);
126 mul(c, a1, a2);
127 CHECK(a.validate() && c.validate() && a == c);
128 }
74129 }
75130
76131 cerr << "\nvalidating SqrRoot...";
155210 }
156211
157212 cerr << "\nvalidating GCD...";
158 for (long i = 0; i < 100000; i++) {
159 long a_len = RandomBnd(4000)+5;
160 long b_len = RandomBnd(4000)+5;
161 long c_len = RandomBnd(500)+1;
213 for (long i = 0; i < 1000000; i++) {
214 long a_len = RandomBnd(1000)+1;
215 long b_len = RandomBnd(1000)+1;
216 long c_len = RandomBnd(200)+1;
162217
163218 ZZ a, b, c;
164219 RandomLen(a, a_len);
167222
168223 a *= c;
169224 b *= c;
225
226 if (RandomBnd(2)) a = -a;
227 if (RandomBnd(2)) b = -b;
170228
171229 ZZ d, s, t, d1;
172230
176234 CHECK(d.validate() && s.validate() && t.validate() && d1.validate());
177235 CHECK(d == d1 && d == a*s + b*t);
178236 CHECK(divide(a, d) && divide(b, d));
237
238 CHECK(abs(s) <= 1 || 2*d*abs(s) < abs(b));
239 CHECK(abs(t) <= 1 || 2*d*abs(t) < abs(a));
240
241 if (a < 0) { a = -a; s = -s; }
242 if (b < 0) { b = -b; t = -t; }
243 if (a < b) { swap(a, b); swap(s, t); }
244
245 // so now we have a >= b >= 0
246 // check that s in (-b/2*d, b/2*d]
247 CHECK(2*d*s > -b && 2*d*s <= b);
179248 }
180249
181250 cerr << "\nvalidating InvMod...";
182251 for (long i = 0; i < 100000; i++) {
183 long n_len = RandomBnd(4000)+5;
252 long n_len = RandomBnd(4000)+4;
184253
185254 ZZ a, n, x;
186255 RandomLen(n, n_len);
187256 RandomBnd(a, n);
188257
189258 long r = InvModStatus(x, a, n);
190 CHECK((r == 0 && (x * a) % n == 1) || (r == 1 && x != 1 && x == GCD(a, n)) );
259 CHECK((r == 0 && (x * a) % n == 1 && 0 <= x && x < n) ||
260 (r == 1 && x != 1 && x == GCD(a, n)) );
191261 }
192262
193263 cerr << "\nvalidating RatRecon...";
756756
757757
758758
759
760 /* Compute a = b * 2^l mod p, where p = 2^n+1. 0<=l<=n and 0<b<p are
759 static void
760 SS_AddMod(ZZ& x, const ZZ& a, const ZZ& b, const ZZ& p, long n)
761 // x = a + b mod p, where p = 2^n+1, a, b in [0, p).
762 // x may not alias p.
763 {
764 #ifndef NTL_PROVIDES_SS_LIP_IMPL
765 add(x, a, b);
766 if (x >= p) {
767 x--; SwitchBit(x, n); // x -= p
768 }
769 #else
770 SS_AddMod_lip_impl(x, a, b, p, n);
771 #endif
772 }
773
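[editor's note] A quick worked check of the branchless reduction above (nothing beyond the comment's own claim): when the sum x lies in [p, 2p-2], x-1 lies in [2^n, 2^{n+1}-1], so bit n of x-1 is set; SwitchBit(x, n) clears it, subtracting 2^n, and the two steps together subtract 2^n + 1 = p. For example, with n = 4 and p = 17, a = 15 and b = 10 give x = 25; x-- yields 24 = 11000 in binary, clearing bit 4 leaves 8, and indeed 25 mod 17 = 8.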
774 static void
775 SS_SubMod(ZZ& x, const ZZ& a, const ZZ& b, const ZZ& p, long n)
776 // x = a - b mod p, where p = 2^n+1, a, b in [0, p).
777 // x may not alias b or p.
778 {
779 #ifndef NTL_PROVIDES_SS_LIP_IMPL
780 if (a < b) {
781 add(x, a, p);
782 SubPos(x, x, b);
783 }
784 else {
785 SubPos(x, a, b);
786 }
787 #else
788 SS_SubMod_lip_impl(x, a, b, p, n);
789 #endif
790 }
791
792
793
794 /* Compute a = b * 2^e mod p, where p = 2^n+1. 0<=e<n and 0<b<p are
761795 assumed. */
762 static void LeftRotate(ZZ& a, const ZZ& b, long l, const ZZ& p, long n, ZZ& scratch)
763 {
764 if (l == 0) {
796
797 static void
798 LeftRotate(ZZ& a, const ZZ& b, long e, const ZZ& p, long n, ZZ& scratch)
799 {
800 #ifndef NTL_PROVIDES_SS_LIP_IMPL
801 if (e == 0) {
765802 if (&a != &b) {
766803 a = b;
767804 }
768805 return;
769806 }
770807
771 /* scratch := upper l bits of b */
772 RightShift(scratch, b, n - l);
773 /* a := 2^l * lower n - l bits of b */
774 trunc(a, b, n - l);
775 LeftShift(a, a, l);
808 /* scratch := upper e bits of b */
809 RightShift(scratch, b, n - e);
810 /* a := 2^e * lower n - e bits of b */
811 trunc(a, b, n - e);
812 LeftShift(a, a, e);
776813 /* a -= scratch */
777 sub(a, a, scratch);
778 if (sign(a) < 0) {
779 add(a, a, p);
780 }
781 }
782
783
784 /* Compute a = b * 2^l mod p, where p = 2^n+1. 0<=b<p is assumed. */
785 static void Rotate(ZZ& a, const ZZ& b, long l, const ZZ& p, long n, ZZ& scratch)
786 {
787 if (IsZero(b)) {
788 clear(a);
789 return;
790 }
791
792 /* l %= 2n */
793 if (l >= 0) {
794 l %= (n << 1);
795 } else {
796 l = (n << 1) - 1 - (-(l + 1) % (n << 1));
797 }
798
799 /* a = b * 2^l mod p */
800 if (l < n) {
801 LeftRotate(a, b, l, p, n, scratch);
802 } else {
803 LeftRotate(a, b, l - n, p, n, scratch);
804 SubPos(a, p, a);
805 }
806 }
807
808
809
810 /* Fast Fourier Transform. a is a vector of length 2^l, 2^l divides 2n,
811 p = 2^n+1, w = 2^r mod p is a primitive (2^l)th root of
812 unity. Returns a(1),a(w),...,a(w^{2^l-1}) mod p in bit-reverse
813 order. */
814 static void fft(ZZVec& a, long r, long l, const ZZ& p, long n)
815 {
816 long round;
817 long off, i, j, e;
818 long halfsize;
819 ZZ tmp, tmp1;
820 ZZ scratch;
821
822 for (round = 0; round < l; round++, r <<= 1) {
823 halfsize = 1L << (l - 1 - round);
824 for (i = (1L << round) - 1, off = 0; i >= 0; i--, off += halfsize) {
825 for (j = 0, e = 0; j < halfsize; j++, off++, e+=r) {
826 /* One butterfly :
827 ( a[off], a[off+halfsize] ) *= ( 1 w^{j2^round} )
828 ( 1 -w^{j2^round} ) */
829 /* tmp = a[off] - a[off + halfsize] mod p */
830 sub(tmp, a[off], a[off + halfsize]);
831 if (sign(tmp) < 0) {
832 add(tmp, tmp, p);
833 }
834 /* a[off] += a[off + halfsize] mod p */
835 add(a[off], a[off], a[off + halfsize]);
836 sub(tmp1, a[off], p);
837 if (sign(tmp1) >= 0) {
838 a[off] = tmp1;
839 }
840 /* a[off + halfsize] = tmp * w^{j2^round} mod p */
841 Rotate(a[off + halfsize], tmp, e, p, n, scratch);
842 }
814 SS_SubMod(a, a, scratch, p, n);
815 #else
816 LeftRotate_lip_impl(a, b, e, p, n, scratch);
817 #endif
818 }
819
820
821 #define SS_FFT_THRESH (4)
822 #define SS_NTEMPS (3)
823 #define SS_FFT_RDUP (3)
824
825 static long
826 SS_FFTRoundUp(long xn, long k)
827 {
828 long n = 1L << k;
829 if (xn <= 0) return n;
830
831 xn = ((xn+((1L << SS_FFT_RDUP)-1)) >> SS_FFT_RDUP) << SS_FFT_RDUP;
832
833 if (xn > n - (n >> 4)) xn = n;
834
835 return xn;
836 }
837
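[editor's note] Two illustrative evaluations of SS_FFTRoundUp (hypothetical inputs): with k = 10 (so n = 1024) and xn = 100, xn is rounded up to the next multiple of 2^SS_FFT_RDUP = 8, giving 104, and since 104 <= n - n/16 = 960 the value 104 is returned; with xn = 1000 the rounded value 1000 exceeds 960, so the routine snaps up to the full length 1024.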
838
839
840 // p = 2^n+1, where n = r*2^{l-1}, so 2^r is primitive 2^l-th root
841 // of unity mod p.
842
843 // j in [0, 2^{level-1})
844 // a = b*2^{j*r*2^{l-level}}
845 static void
846 Rotate(ZZ& a, const ZZ& b, long j, long level,
847 long r, long l, const ZZ& p, long n, ZZ* tmp)
848 {
849 if (l-level >= 0)
850 LeftRotate(a, b, (j*r) << (l-level), p, n, tmp[0]);
851 else if (((j*r) & 1) == 0)
852 LeftRotate(a, b, (j*r) >> 1, p, n, tmp[0]);
853 else {
854 // use sqrt(2) = 2^{3n/4} - 2^{n/4}
855
856 long k = (j*r) >> 1; // j*r = 2*k + 1
857
858 // now compute a = b*2^{k+1/2} mod p
859
860 // a = b*{2^k} mod p
861 LeftRotate(a, b, k, p, n, tmp[0]);
862
863 // tmp[1] = a*2^{n/4} mod p
864 LeftRotate(tmp[1], a, n >> 2, p, n, tmp[0]);
865
866 // a = a*2^{3n/4} mod p
867 LeftRotate(a, a, 3*(n >> 2), p, n, tmp[0]);
868
869 // a -= tmp[1] mod p
870 SS_SubMod(a, a, tmp[1], p, n);
871 }
872 }
873
874
875 static void
876 SS_butterfly(ZZ& x, ZZ& y, const ZZ& p, long n, ZZ* tmp)
877 // (x, y) := (x+y, x-y)
878 {
879 /* tmp[0] = x - y mod p */
880 SS_SubMod(tmp[0], x, y, p, n);
881
882 /* x += y mod p */
883 SS_AddMod(x, x, y, p, n);
884
885 y = tmp[0];
886 }
887
888 static void
889 SS_fwd_butterfly(ZZ& x, ZZ& y, long j, long level,
890 long r, long l, const ZZ& p, long n,
891 ZZ* tmp)
892
893 // ( x, y ) *= ( 1 2^{j*r*2^{l-level}} )
894 // ( 1 -2^{j*r*2^{l-level}} )
895
896 {
897 /* tmp[0] = x - y mod p */
898 SS_SubMod(tmp[0], x, y, p, n);
899
900 /* x += y mod p */
901 SS_AddMod(x, x, y, p, n);
902
903 /* y = tmp[0] * 2^{j*r*2^{l-level}} mod p */
904 Rotate(y, tmp[0], j, level, r, l, p, n, tmp+1);
905 }
906
907 static void
908 SS_inv_butterfly(ZZ& x, ZZ& y, long j, long level,
909 long r, long l, const ZZ& p, long n,
910 ZZ* tmp)
911
912 // ( x, y ) *= ( 1 1 )
913 // ( 2^{-j*r*2^{l-level}} -2^{-j*r*2^{l-level}} )
914
915 // *** should not be called with j == 0
916 // call SS_butterfly instead
917
918 {
919 /* tmp[0] = y * 2^{(2^{level-1}-j)*r*2^{l-level}} mod p */
920 Rotate(tmp[0], y, (1L<<(level-1))-j, level, r, l, p, n, tmp+1);
921
922 /* y = x + tmp[0] mod p */
923 SS_AddMod(y, x, tmp[0], p, n); // NEGATED
924
925 /* x = x - tmp[0] mod p */
926 SS_SubMod(x, x, tmp[0], p, n); // NEGATED
927 }
928
929
930 // Much of the following logic is taken from the code in FFT.cpp
931 // for single-precision modular FFT's, which itself is adapted
932 // from code originally written by David Harvey.
933 // See copyright notice in FFT.cpp.
934
935 // size == 2^level
936 static void
937 fft_layer(ZZ* xp, long blocks, long size, long level, long r, long l,
938 const ZZ& p, long n, ZZ* tmp)
939 {
940 size /= 2;
941
942 do {
943 ZZ *xp0 = xp;
944 ZZ *xp1 = xp + size;
945
946 for (long j = 0; j < size; j++)
947 SS_fwd_butterfly(xp0[j], xp1[j], j, level, r, l, p, n, tmp);
948
949 xp += 2*size;
950 } while (--blocks != 0);
951 }
952
953 static void
954 fft_base(ZZ* xp, long lgN, long r, long l, const ZZ& p, long n,
955 ZZ* tmp)
956 {
957 long N = 1L << lgN;
958
959 for (long j = lgN, size = N, blocks = 1;
960 j >= 1; j--, blocks <<= 1, size >>= 1)
961 fft_layer(xp, blocks, size, j, r, l, p, n, tmp);
962 }
963
964
965 static void
966 fft_rec(ZZ* xp, long lgN, long r, long l, const ZZ& p, long n,
967 ZZ* tmp)
968 {
969 if (lgN <= SS_FFT_THRESH) {
970 fft_base(xp, lgN, r, l, p, n, tmp);
971 return;
972 }
973
974 long N = 1L << lgN;
975 long half = N >> 1;
976
977 ZZ *xp0 = xp;
978 ZZ *xp1 = xp + half;
979
980 for (long j = 0; j < half; j++)
981 SS_fwd_butterfly(xp0[j], xp1[j], j, lgN, r, l, p, n, tmp);
982
983 fft_rec(xp0, lgN-1, r, l, p, n, tmp);
984 fft_rec(xp1, lgN-1, r, l, p, n, tmp);
985 }
986
987
988 static void
989 fft_short(ZZ* xp, long yn, long xn, long lgN,
990 long r, long l, const ZZ& p, long n,
991 ZZ* tmp)
992 {
993 long N = 1L << lgN;
994
995 if (yn == N)
996 {
997 if (xn == N && lgN <= SS_FFT_THRESH)
998 {
999 // no truncation
1000 fft_base(xp, lgN, r, l, p, n, tmp);
1001 return;
1002 }
8431003 }
844 }
845 }
846
847 /* Inverse FFT. r must be the same as in the call to FFT. Result is
848 by 2^l too large. */
849 static void ifft(ZZVec& a, long r, long l, const ZZ& p, long n)
850 {
851 long round;
852 long off, i, j, e;
853 long halfsize;
854 ZZ tmp, tmp1;
855 ZZ scratch;
856
857 for (round = l - 1, r <<= l - 1; round >= 0; round--, r >>= 1) {
858 halfsize = 1L << (l - 1 - round);
859 for (i = (1L << round) - 1, off = 0; i >= 0; i--, off += halfsize) {
860 for (j = 0, e = 0; j < halfsize; j++, off++, e+=r) {
861 /* One inverse butterfly :
862 ( a[off], a[off+halfsize] ) *= ( 1 1 )
863 ( w^{-j2^round} -w^{-j2^round} ) */
864 /* a[off + halfsize] *= w^{-j2^round} mod p */
865 Rotate(a[off + halfsize], a[off + halfsize], -e, p, n, scratch);
866 /* tmp = a[off] - a[off + halfsize] */
867 sub(tmp, a[off], a[off + halfsize]);
868
869 /* a[off] += a[off + halfsize] mod p */
870 add(a[off], a[off], a[off + halfsize]);
871 sub(tmp1, a[off], p);
872 if (sign(tmp1) >= 0) {
873 a[off] = tmp1;
874 }
875 /* a[off+halfsize] = tmp mod p */
876 if (sign(tmp) < 0) {
877 add(a[off+halfsize], tmp, p);
878 } else {
879 a[off+halfsize] = tmp;
880 }
881 }
1004
1005
1006 // divide-and-conquer algorithm
1007
1008 long half = N >> 1;
1009
1010 if (yn <= half)
1011 {
1012 if (xn <= half)
1013 {
1014 fft_short(xp, yn, xn, lgN-1, r, l, p, n, tmp);
1015 }
1016 else
1017 {
1018 xn -= half;
1019
1020 // (X, Y) -> X + Y
1021 for (long j = 0; j < xn; j++)
1022 SS_AddMod(xp[j], xp[j], xp[j + half], p, n);
1023
1024 fft_short(xp, yn, half, lgN-1, r, l, p, n, tmp);
1025 }
8821026 }
883 }
884 }
1027 else
1028 {
1029 yn -= half;
1030
1031 ZZ *xp0 = xp;
1032 ZZ *xp1 = xp + half;
1033
1034 if (xn <= half)
1035 {
1036 // X -> (X, w*X)
1037 for (long j = 0; j < xn; j++)
1038 Rotate(xp1[j], xp0[j], j, lgN, r, l, p, n, tmp);
1039
1040 fft_short(xp0, half, xn, lgN-1, r, l, p, n, tmp);
1041 fft_short(xp1, yn, xn, lgN-1, r, l, p, n, tmp);
1042 }
1043 else
1044 {
1045 xn -= half;
1046
1047 // (X, Y) -> (X + Y, w*(X - Y))
1048 for (long j = 0; j < xn; j++)
1049 SS_fwd_butterfly(xp0[j], xp1[j], j, lgN, r, l, p, n, tmp);
1050
1051 // X -> (X, w*X)
1052 for (long j = xn; j < half; j++)
1053 Rotate(xp1[j], xp0[j], j, lgN, r, l, p, n, tmp);
1054
1055 fft_short(xp0, half, half, lgN-1, r, l, p, n, tmp);
1056 fft_short(xp1, yn, half, lgN-1, r, l, p, n, tmp);
1057 }
1058 }
1059 }
1060
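[editor's note] In outline, the split in fft_short is the usual decimation-in-frequency step: with N = 2M, input halves x_i and y_i = x_{i+M}, and w of order N (so w^M = -1),
    a(w^{2j})   = sum_{i<M} (x_i + y_i) * w^{2ij}
    a(w^{2j+1}) = sum_{i<M} w^i * (x_i - y_i) * w^{2ij},
and because the outputs are produced in bit-reverse order, the even-exponent values fill the first half of the array and the odd-exponent values the second half. This is why inputs beyond xn (known to be zero) and outputs beyond yn (not wanted) let whole sub-transforms be skipped.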
1061
1062
1063 static void
1064 fft(ZZVec& a, long r, long l, const ZZ& p, long n)
1065 {
1066 ZZ tmp[SS_NTEMPS];
1067 fft_rec(&a[0], l, r, l, p, n, &tmp[0]);
1068 }
1069
1070 static void
1071 fft1(ZZVec& a, long r, long l, long l1, const ZZ& p, long n)
1072 {
1073 ZZ tmp[SS_NTEMPS];
1074 fft_rec(&a[0], l, r, l1, p, n, &tmp[0]);
1075 }
1076
1077 static void
1078 fft_trunc(ZZVec& a, long yn, long xn,
1079 long r, long l, long l1, const ZZ& p, long n)
1080 {
1081 ZZ tmp[SS_NTEMPS];
1082 fft_short(&a[0], yn, xn, l, r, l1, p, n, &tmp[0]);
1083 }
1084
1085 static void
1086 ifft_layer(ZZ* xp, long blocks, long size, long level, long r, long l,
1087 const ZZ& p, long n, ZZ* tmp)
1088 {
1089 size /= 2;
1090
1091 do {
1092 ZZ *xp0 = xp;
1093 ZZ *xp1 = xp + size;
1094
1095 SS_butterfly(xp0[0], xp1[0], p, n, tmp);
1096 for (long j = 1; j < size; j++)
1097 SS_inv_butterfly(xp0[j], xp1[j], j, level, r, l, p, n, tmp);
1098
1099 xp += 2*size;
1100 } while (--blocks != 0);
1101 }
1102
1103 static void
1104 ifft_base(ZZ* xp, long lgN, long r, long l, const ZZ& p, long n,
1105 ZZ* tmp)
1106 {
1107 long N = 1L << lgN;
1108
1109 for (long j = 1, size = 2, blocks = N/2;
1110 j <= lgN; j++, blocks >>= 1, size <<= 1)
1111 ifft_layer(xp, blocks, size, j, r, l, p, n, tmp);
1112 }
1113
1114
1115 static void
1116 ifft_rec(ZZ* xp, long lgN, long r, long l, const ZZ& p, long n,
1117 ZZ* tmp)
1118 {
1119 if (lgN <= SS_FFT_THRESH) {
1120 ifft_base(xp, lgN, r, l, p, n, tmp);
1121 return;
1122 }
1123
1124 long N = 1L << lgN;
1125 long half = N >> 1;
1126
1127 ZZ *xp0 = xp;
1128 ZZ *xp1 = xp + half;
1129
1130 ifft_rec(xp0, lgN-1, r, l, p, n, tmp);
1131 ifft_rec(xp1, lgN-1, r, l, p, n, tmp);
1132
1133 SS_butterfly(xp0[0], xp1[0], p, n, tmp);
1134 for (long j = 1; j < half; j++)
1135 SS_inv_butterfly(xp0[j], xp1[j], j, lgN, r, l, p, n, tmp);
1136 }
1137
1138 static void
1139 ifft_short2(ZZ* xp, long yn, long lgN,
1140 long r, long l, const ZZ& p, long n, ZZ* tmp);
1141
1142 static void
1143 ifft_short1(ZZ* xp, long yn, long lgN,
1144 long r, long l, const ZZ& p, long n, ZZ* tmp)
1145
1146 {
1147 long N = 1L << lgN;
1148
1149 if (yn == N && lgN <= SS_FFT_THRESH)
1150 {
1151 // no truncation
1152 ifft_base(xp, lgN, r, l, p, n, tmp);
1153 return;
1154 }
1155
1156 // divide-and-conquer algorithm
1157
1158 long half = N >> 1;
1159
1160 if (yn <= half)
1161 {
1162 // X -> 2X
1163 for (long j = 0; j < yn; j++)
1164 SS_AddMod(xp[j], xp[j], xp[j], p, n);
1165
1166 ifft_short1(xp, yn, lgN-1, r, l, p, n, tmp);
1167 }
1168 else
1169 {
1170 ZZ *xp0 = xp;
1171 ZZ *xp1 = xp + half;
1172
1173 ifft_short1(xp0, half, lgN-1, r, l, p, n, tmp);
1174
1175 yn -= half;
1176
1177 // X -> (2X, w*X)
1178 for (long j = yn; j < half; j++)
1179 {
1180 tmp[0] = xp0[j];
1181 SS_AddMod(xp0[j], xp0[j], xp0[j], p, n);
1182 Rotate(xp1[j], tmp[0], j, lgN, r, l, p, n, tmp+1);
1183 }
1184
1185 ifft_short2(xp1, yn, lgN-1, r, l, p, n, tmp);
1186
1187 // (X, Y) -> (X + Y/w, X - Y/w)
1188 SS_butterfly(xp0[0], xp1[0], p, n, tmp);
1189 for (long j = 1; j < yn; j++)
1190 SS_inv_butterfly(xp0[j], xp1[j], j, lgN, r, l, p, n, tmp);
1191 }
1192 }
1193
1194
1195 static void
1196 ifft_short2(ZZ* xp, long yn, long lgN,
1197 long r, long l, const ZZ& p, long n, ZZ* tmp)
1198
1199 {
1200 long N = 1L << lgN;
1201
1202 if (yn == N && lgN <= SS_FFT_THRESH)
1203 {
1204 // no truncation
1205 ifft_base(xp, lgN, r, l, p, n, tmp);
1206 return;
1207 }
1208
1209 // divide-and-conquer algorithm
1210
1211 long half = N >> 1;
1212
1213 if (yn <= half)
1214 {
1215 // X -> 2X
1216 for (long j = 0; j < yn; j++)
1217 SS_AddMod(xp[j], xp[j], xp[j], p, n);
1218
1219 // (X, Y) -> X + Y
1220 for (long j = yn; j < half; j++)
1221 SS_AddMod(xp[j], xp[j], xp[j + half], p, n);
1222
1223 ifft_short2(xp, yn, lgN-1, r, l, p, n, tmp);
1224
1225 // (X, Y) -> X - Y
1226 for (long j = 0; j < yn; j++)
1227 SS_SubMod(xp[j], xp[j], xp[j + half], p, n);
1228 }
1229 else
1230 {
1231 ZZ *xp0 = xp;
1232 ZZ *xp1 = xp + half;
1233
1234 ifft_short1(xp0, half, lgN-1, r, l, p, n, tmp);
1235
1236 yn -= half;
1237
1238 // (X, Y) -> (2X - Y, w*(X - Y))
1239 for (long j = yn; j < half; j++)
1240 {
1241 SS_SubMod(tmp[0], xp0[j], xp1[j], p, n);
1242 SS_AddMod(xp0[j], xp0[j], tmp[0], p, n);
1243 Rotate(xp1[j], tmp[0], j, lgN, r, l, p, n, tmp+1);
1244 }
1245
1246
1247 ifft_short2(xp1, yn, lgN-1, r, l, p, n, tmp);
1248
1249 // (X, Y) -> (X + Y/w, X - Y/w)
1250 SS_butterfly(xp0[0], xp1[0], p, n, tmp);
1251 for (long j = 1; j < yn; j++)
1252 SS_inv_butterfly(xp0[j], xp1[j], j, lgN, r, l, p, n, tmp);
1253 }
1254 }
1255
1256
1257 static void
1258 ifft(ZZVec& a, long r, long l, const ZZ& p, long n)
1259 {
1260 ZZ tmp[SS_NTEMPS];
1261 ifft_rec(&a[0], l, r, l, p, n, &tmp[0]);
1262 }
1263
1264 static void
1265 ifft1(ZZVec& a, long r, long l, long l1, const ZZ& p, long n)
1266 {
1267 ZZ tmp[SS_NTEMPS];
1268 ifft_rec(&a[0], l, r, l1, p, n, &tmp[0]);
1269 }
1270
1271 static void
1272 ifft_trunc(ZZVec& a, long yn, long r, long l, long l1, const ZZ& p, long n)
1273 {
1274 ZZ tmp[SS_NTEMPS];
1275 ifft_short1(&a[0], yn, l, r, l1, p, n, &tmp[0]);
1276 }
1277
1278
8851279
8861280
8871281
8931287 absolute value. The algorithm is not called recursively;
8941288 coefficient arithmetic is done directly.*/
8951289
1290 // The original version of SSMul was written by Juergen Gerhard.
1291 // However, it has been almost completely re-written so as
1292 // to provide the following improvements:
1293 // * uses truncated FFT and Inverse FFT algorithms,
1294 // for better performance between powers of 2
1295 // * better cache locality because of divide and conquer structure
1296 // * better performance because of sqrt(2) trick
1297
8961298 void SSMul(ZZX& c, const ZZX& a, const ZZX& b)
8971299 {
8981300 if (&a == &b) {
9131315
9141316 /* Choose m and r suitably */
9151317 long l = NextPowerOfTwo(n + 1) - 1; /* 2^l <= n < 2^{l+1} */
916 long m2 = 1L << (l + 1); /* m2 = 2m = 2^{l+1} */
1318 long N = 1L << (l + 1); /* N = 2^{l+1} */
9171319 /* Bitlength of the product: if the coefficients of a are absolutely less
9181320 than 2^ka and the coefficients of b are absolutely less than 2^kb, then
9191321 the coefficients of ab are absolutely less than
9231325 long r = (bound >> l) + 1;
9241326 long mr = r << l;
9251327
1328 // sqrt(2) trick
1329 long l1 = l;
1330 if (l1 >= 3) {
1331 long alt_l1 = l-1;
1332 long alt_r = (bound >> alt_l1) + 1;
1333 long alt_mr = alt_r << alt_l1;
1334
1335 if (alt_mr < mr - mr/8) {
1336 l1 = alt_l1;
1337 r = alt_r;
1338 mr = alt_mr;
1339 }
1340 }
1341
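[editor's note] A rough worked example of this choice (hypothetical numbers): with n = 100 we get l = 6 and N = 128; if bound = 129 bits, then r = (129 >> 6) + 1 = 3 and mr = 192, while the l-1 alternative gives r = 5 and mr = 160. Since 160 < 192 - 192/8 = 168, the smaller ring p = 2^160 + 1 is selected (l1 = 5), and the odd-exponent rotations then go through the sqrt(2) branch of Rotate.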
9261342 /* p := 2^{mr}+1 */
9271343 ZZ p;
9281344 set(p);
9311347
9321348 /* Make coefficients of a and b positive */
9331349 ZZVec aa, bb;
934 aa.SetSize(m2, p.size());
935 bb.SetSize(m2, p.size());
1350 aa.SetSize(N, p.size());
1351 bb.SetSize(N, p.size());
9361352
9371353 for (long i = 0; i <= deg(a); i++) {
9381354 if (sign(a.rep[i]) >= 0) {
9501366 }
9511367 }
9521368
953
954 /* 2m-point FFT's mod p */
955 fft(aa, r, l + 1, p, mr);
956 fft(bb, r, l + 1, p, mr);
1369 long yn = SS_FFTRoundUp(n+1, l+1);
1370
1371 /* N-point FFT's mod p */
1372 fft_trunc(aa, yn, SS_FFTRoundUp(na+1, l+1), r, l+1, l1+1, p, mr);
1373 fft_trunc(bb, yn, SS_FFTRoundUp(nb+1, l+1), r, l+1, l1+1, p, mr);
9571374
9581375
9591376 /* Pointwise multiplication aa := aa * bb mod p */
9601377 // NOTE: we attempt to parallelize this
9611378 // Unfortunately, the bulk of the time is spent
9621379 // in the FFT, so this is not very effective
963 NTL_EXEC_RANGE(m2, first, last)
1380 NTL_EXEC_RANGE(yn, first, last)
9641381 ZZ tmp, ai;
9651382 for (long i = first; i < last; i++) {
9661383 mul(ai, aa[i], bb[i]);
9761393 }
9771394 NTL_EXEC_RANGE_END
9781395
979 ifft(aa, r, l + 1, p, mr);
980
981 /* Retrieve c, dividing by 2m, and subtracting p where necessary */
1396 ifft_trunc(aa, yn, r, l+1, l1+1, p, mr);
1397
1398 /* Retrieve c, dividing by N, and subtracting p where necessary */
9821399 c.rep.SetLength(n + 1);
9831400 ZZ ai, tmp, scratch;
9841401 for (long i = 0; i <= n; i++) {
9851402 ai = aa[i];
9861403 ZZ& ci = c.rep[i];
9871404 if (!IsZero(ai)) {
988 /* ci = -ai * 2^{mr-l-1} = ai * 2^{-l-1} = ai / 2m mod p */
1405 /* ci = -ai * 2^{mr-l-1} = ai * 2^{-l-1} = ai / N mod p */
9891406 LeftRotate(ai, ai, mr - l - 1, p, mr, scratch);
9901407 sub(tmp, p, ai);
9911408 if (NumBits(tmp) >= mr) { /* ci >= (p-1)/2 */
10161433 long r = (bound >> l) + 1;
10171434 long mr = r << l;
10181435
1436 // sqrt(2) trick
1437 long l1 = l;
1438 if (l1 >= 3) {
1439 long alt_l1 = l-1;
1440 long alt_r = (bound >> alt_l1) + 1;
1441 long alt_mr = alt_r << alt_l1;
1442
1443 if (alt_mr < mr - mr/8) {
1444 l1 = alt_l1;
1445 r = alt_r;
1446 mr = alt_mr;
1447 }
1448 }
1449
10191450 return double(mr + 1)/double(bound);
10201451 }
10211452
10431474
10441475 if (nt == 1) {
10451476
1046 return (k >= 26 && rat < 1.20) ||
1477 return (k >= 13 && rat < 1.15) ||
1478 (k >= 26 && rat < 1.30) ||
10471479 (k >= 53 && rat < 1.60) ||
10481480 (k >= 106 && rat < 1.80) ||
10491481 (k >= 212 && rat < 2.00);
11251557 }
11261558 }
11271559
1128
1129
11301560 void SSSqr(ZZX& c, const ZZX& a)
1561
11311562 {
11321563 long na = deg(a);
1564
11331565 if (na <= 0) {
11341566 PlainSqr(c, a);
11351567 return;
11381570 long n = na + na; /* degree of the product */
11391571
11401572
1573 /* Choose m and r suitably */
11411574 long l = NextPowerOfTwo(n + 1) - 1; /* 2^l <= n < 2^{l+1} */
1142 long m2 = 1L << (l + 1); /* m2 = 2m = 2^{l+1} */
1575 long N = 1L << (l + 1); /* N = 2^{l+1} */
11431576 long bound = 2 + NumBits(na) + 2*MaxBits(a);
1577 /* Let r be minimal so that mr > bound */
11441578 long r = (bound >> l) + 1;
11451579 long mr = r << l;
1580
1581 // sqrt(2) trick
1582 long l1 = l;
1583 if (l1 >= 3) {
1584 long alt_l1 = l-1;
1585 long alt_r = (bound >> alt_l1) + 1;
1586 long alt_mr = alt_r << alt_l1;
1587
1588 if (alt_mr < mr - mr/8) {
1589 l1 = alt_l1;
1590 r = alt_r;
1591 mr = alt_mr;
1592 }
1593 }
11461594
11471595 /* p := 2^{mr}+1 */
11481596 ZZ p;
11501598 LeftShift(p, p, mr);
11511599 add(p, p, 1);
11521600
1601 /* Make coefficients of a positive */
11531602 ZZVec aa;
1154 aa.SetSize(m2, p.size());
1603 aa.SetSize(N, p.size());
11551604
11561605 for (long i = 0; i <= deg(a); i++) {
11571606 if (sign(a.rep[i]) >= 0) {
11611610 }
11621611 }
11631612
1164
1165 /* 2m-point FFT's mod p */
1166 fft(aa, r, l + 1, p, mr);
1167
1168 /* Pointwise multiplication aa := aa * aa mod p */
1613 long yn = SS_FFTRoundUp(n+1, l+1);
1614
1615 /* N-point FFT's mod p */
1616 fft_trunc(aa, yn, SS_FFTRoundUp(na+1, l+1), r, l+1, l1+1, p, mr);
1617
1618
1619 /* Pointwise multiplication aa := aa * aa mod p */
11691620 // NOTE: we attempt to parallelize this
11701621 // Unfortunately, the bulk of the time is spent
11711622 // in the FFT, so this is not very effective
1172 NTL_EXEC_RANGE(m2, first, last)
1623 NTL_EXEC_RANGE(yn, first, last)
11731624 ZZ tmp, ai;
11741625 for (long i = first; i < last; i++) {
11751626 sqr(ai, aa[i]);
11841635 aa[i] = ai;
11851636 }
11861637 NTL_EXEC_RANGE_END
1187
1188 ifft(aa, r, l + 1, p, mr);
1189
1190
1191 /* Retrieve c, dividing by 2m, and subtracting p where necessary */
1638
1639 ifft_trunc(aa, yn, r, l+1, l1+1, p, mr);
1640
1641 /* Retrieve c, dividing by N, and subtracting p where necessary */
11921642 c.rep.SetLength(n + 1);
11931643 ZZ ai, tmp, scratch;
11941644 for (long i = 0; i <= n; i++) {
11951645 ai = aa[i];
11961646 ZZ& ci = c.rep[i];
11971647 if (!IsZero(ai)) {
1198 /* ci = -ai * 2^{mr-l-1} = ai * 2^{-l-1} = ai / 2m mod p */
1648 /* ci = -ai * 2^{mr-l-1} = ai * 2^{-l-1} = ai / N mod p */
11991649 LeftRotate(ai, ai, mr - l - 1, p, mr, scratch);
12001650 sub(tmp, p, ai);
12011651 if (NumBits(tmp) >= mr) { /* ci >= (p-1)/2 */
12081658 clear(ci);
12091659 }
12101660 }
1661
1662
12111663
12121664 void sqr(ZZX& c, const ZZX& a)
12131665 {
184184 ptr = ZZ_pInfo_stg;
185185 }
186186
187
188187 void ZZ_pContext::restore() const
189188 {
190189 if (ZZ_pInfo == ptr.get()) return;
18871887 }
18881888 }
18891889
1890 void GCD(ZZ_pEX& x, const ZZ_pEX& a, const ZZ_pEX& b)
1890 void PlainGCD(ZZ_pEX& x, const ZZ_pEX& a, const ZZ_pEX& b)
18911891 {
18921892 ZZ_pE t;
18931893
19221922 mul(x, x, t);
19231923 }
19241924
1925
1926
1927
1925 class _NTL_ZZ_pEXMatrix {
1926 private:
1927
1928 _NTL_ZZ_pEXMatrix(const _NTL_ZZ_pEXMatrix&); // disable
1929 ZZ_pEX elts[2][2];
1930
1931 public:
1932
1933 _NTL_ZZ_pEXMatrix() { }
1934 ~_NTL_ZZ_pEXMatrix() { }
1935
1936 void operator=(const _NTL_ZZ_pEXMatrix&);
1937 ZZ_pEX& operator() (long i, long j) { return elts[i][j]; }
1938 const ZZ_pEX& operator() (long i, long j) const { return elts[i][j]; }
1939 };
1940
1941
1942 void _NTL_ZZ_pEXMatrix::operator=(const _NTL_ZZ_pEXMatrix& M)
1943 {
1944 elts[0][0] = M.elts[0][0];
1945 elts[0][1] = M.elts[0][1];
1946 elts[1][0] = M.elts[1][0];
1947 elts[1][1] = M.elts[1][1];
1948 }
1949
1950
1951 static
1952 void mul(ZZ_pEX& U, ZZ_pEX& V, const _NTL_ZZ_pEXMatrix& M)
1953 // (U, V)^T = M*(U, V)^T
1954 {
1955 ZZ_pEX t1, t2, t3;
1956
1957 mul(t1, M(0,0), U);
1958 mul(t2, M(0,1), V);
1959 add(t3, t1, t2);
1960 mul(t1, M(1,0), U);
1961 mul(t2, M(1,1), V);
1962 add(V, t1, t2);
1963 U = t3;
1964 }
1965
1966
1967 static
1968 void mul(_NTL_ZZ_pEXMatrix& A, _NTL_ZZ_pEXMatrix& B, _NTL_ZZ_pEXMatrix& C)
1969 // A = B*C, B and C are destroyed
1970 {
1971 ZZ_pEX t1, t2;
1972
1973 mul(t1, B(0,0), C(0,0));
1974 mul(t2, B(0,1), C(1,0));
1975 add(A(0,0), t1, t2);
1976
1977 mul(t1, B(1,0), C(0,0));
1978 mul(t2, B(1,1), C(1,0));
1979 add(A(1,0), t1, t2);
1980
1981 mul(t1, B(0,0), C(0,1));
1982 mul(t2, B(0,1), C(1,1));
1983 add(A(0,1), t1, t2);
1984
1985 mul(t1, B(1,0), C(0,1));
1986 mul(t2, B(1,1), C(1,1));
1987 add(A(1,1), t1, t2);
1988
1989 long i, j;
1990 for (i = 0; i < 2; i++) {
1991 for (j = 0; j < 2; j++) {
1992 B(i,j).kill();
1993 C(i,j).kill();
1994 }
1995 }
1996 }
1997
1998
1999 void IterHalfGCD(_NTL_ZZ_pEXMatrix& M_out, ZZ_pEX& U, ZZ_pEX& V, long d_red)
2000 {
2001 M_out(0,0).SetMaxLength(d_red);
2002 M_out(0,1).SetMaxLength(d_red);
2003 M_out(1,0).SetMaxLength(d_red);
2004 M_out(1,1).SetMaxLength(d_red);
2005
2006 set(M_out(0,0)); clear(M_out(0,1));
2007 clear(M_out(1,0)); set(M_out(1,1));
2008
2009 long goal = deg(U) - d_red;
2010
2011 if (deg(V) <= goal)
2012 return;
2013
2014 ZZ_pEX Q, t(INIT_SIZE, d_red);
2015
2016 while (deg(V) > goal) {
2017 PlainDivRem(Q, U, U, V);
2018 swap(U, V);
2019
2020 mul(t, Q, M_out(1,0));
2021 sub(t, M_out(0,0), t);
2022 M_out(0,0) = M_out(1,0);
2023 M_out(1,0) = t;
2024
2025 mul(t, Q, M_out(1,1));
2026 sub(t, M_out(0,1), t);
2027 M_out(0,1) = M_out(1,1);
2028 M_out(1,1) = t;
2029 }
2030 }
2031
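[editor's note] In outline (using the convention of the 2x2 mul above): each division step replaces (U, V) by (V, U - Q*V), i.e. multiplies the column (U, V)^T on the left by ( 0 1 ; 1 -Q ), and the same matrix is folded into M_out; since M_out starts as the identity, on return (U, V)^T equals M_out times the original (U, V)^T. This is the invariant that HalfGCD and XHalfGCD below rely on when they apply and compose M1 and M2.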
2032
2033
2034 #define NTL_ZZ_pEX_HalfGCD_CROSSOVER (25)
2035 #define NTL_ZZ_pEX_GCD_CROSSOVER (275)
2036
2037
2038 void HalfGCD(_NTL_ZZ_pEXMatrix& M_out, const ZZ_pEX& U, const ZZ_pEX& V, long d_red)
2039 {
2040 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2041 set(M_out(0,0)); clear(M_out(0,1));
2042 clear(M_out(1,0)); set(M_out(1,1));
2043
2044 return;
2045 }
2046
2047
2048 long n = deg(U) - 2*d_red + 2;
2049 if (n < 0) n = 0;
2050
2051 ZZ_pEX U1, V1;
2052
2053 RightShift(U1, U, n);
2054 RightShift(V1, V, n);
2055
2056 if (d_red <= NTL_ZZ_pEX_HalfGCD_CROSSOVER) {
2057 IterHalfGCD(M_out, U1, V1, d_red);
2058 return;
2059 }
2060
2061 long d1 = (d_red + 1)/2;
2062 if (d1 < 1) d1 = 1;
2063 if (d1 >= d_red) d1 = d_red - 1;
2064
2065 _NTL_ZZ_pEXMatrix M1;
2066
2067 HalfGCD(M1, U1, V1, d1);
2068 mul(U1, V1, M1);
2069
2070 long d2 = deg(V1) - deg(U) + n + d_red;
2071
2072 if (IsZero(V1) || d2 <= 0) {
2073 M_out = M1;
2074 return;
2075 }
2076
2077
2078 ZZ_pEX Q;
2079 _NTL_ZZ_pEXMatrix M2;
2080
2081 DivRem(Q, U1, U1, V1);
2082 swap(U1, V1);
2083
2084 HalfGCD(M2, U1, V1, d2);
2085
2086 ZZ_pEX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
2087
2088 mul(t, Q, M1(1,0));
2089 sub(t, M1(0,0), t);
2090 swap(M1(0,0), M1(1,0));
2091 swap(M1(1,0), t);
2092
2093 t.kill();
2094
2095 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
2096
2097 mul(t, Q, M1(1,1));
2098 sub(t, M1(0,1), t);
2099 swap(M1(0,1), M1(1,1));
2100 swap(M1(1,1), t);
2101
2102 t.kill();
2103
2104 mul(M_out, M2, M1);
2105 }
2106
2107
2108
2109
2110 void XHalfGCD(_NTL_ZZ_pEXMatrix& M_out, ZZ_pEX& U, ZZ_pEX& V, long d_red)
2111 {
2112 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2113 set(M_out(0,0)); clear(M_out(0,1));
2114 clear(M_out(1,0)); set(M_out(1,1));
2115
2116 return;
2117 }
2118
2119 long du = deg(U);
2120
2121 if (d_red <= NTL_ZZ_pEX_HalfGCD_CROSSOVER) {
2122 IterHalfGCD(M_out, U, V, d_red);
2123 return;
2124 }
2125
2126 long d1 = (d_red + 1)/2;
2127 if (d1 < 1) d1 = 1;
2128 if (d1 >= d_red) d1 = d_red - 1;
2129
2130 //ZZ_pXMatrix M1;
2131 _NTL_ZZ_pEXMatrix M1;
2132
2133 HalfGCD(M1, U, V, d1);
2134 mul(U, V, M1);
2135
2136 long d2 = deg(V) - du + d_red;
2137
2138 if (IsZero(V) || d2 <= 0) {
2139 M_out = M1;
2140 return;
2141 }
2142
2143
2144 ZZ_pEX Q;
2145 _NTL_ZZ_pEXMatrix M2;
2146
2147 DivRem(Q, U, U, V);
2148 swap(U, V);
2149
2150 XHalfGCD(M2, U, V, d2);
2151
2152 ZZ_pEX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
2153
2154 mul(t, Q, M1(1,0));
2155 sub(t, M1(0,0), t);
2156 swap(M1(0,0), M1(1,0));
2157 swap(M1(1,0), t);
2158
2159 t.kill();
2160
2161 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
2162
2163 mul(t, Q, M1(1,1));
2164 sub(t, M1(0,1), t);
2165 swap(M1(0,1), M1(1,1));
2166 swap(M1(1,1), t);
2167
2168 t.kill();
2169
2170 mul(M_out, M2, M1);
2171 }
2172
2173 void HalfGCD(ZZ_pEX& U, ZZ_pEX& V)
2174 {
2175 long d_red = (deg(U)+1)/2;
2176
2177 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2178 return;
2179 }
2180
2181 long du = deg(U);
2182
2183
2184 long d1 = (d_red + 1)/2;
2185 if (d1 < 1) d1 = 1;
2186 if (d1 >= d_red) d1 = d_red - 1;
2187
2188 _NTL_ZZ_pEXMatrix M1;
2189
2190 HalfGCD(M1, U, V, d1);
2191 mul(U, V, M1);
2192
2193 long d2 = deg(V) - du + d_red;
2194
2195 if (IsZero(V) || d2 <= 0) {
2196 return;
2197 }
2198
2199 M1(0,0).kill();
2200 M1(0,1).kill();
2201 M1(1,0).kill();
2202 M1(1,1).kill();
2203
2204
2205 ZZ_pEX Q;
2206
2207 DivRem(Q, U, U, V);
2208 swap(U, V);
2209
2210 HalfGCD(M1, U, V, d2);
2211
2212 mul(U, V, M1);
2213 }
2214
2215
2216 void GCD(ZZ_pEX& d, const ZZ_pEX& u, const ZZ_pEX& v)
2217 {
2218 ZZ_pEX u1, v1;
2219
2220 u1 = u;
2221 v1 = v;
2222
2223 if (deg(u1) == deg(v1)) {
2224 if (IsZero(u1)) {
2225 clear(d);
2226 return;
2227 }
2228
2229 rem(v1, v1, u1);
2230 }
2231 else if (deg(u1) < deg(v1)) {
2232 swap(u1, v1);
2233 }
2234
2235 // deg(u1) > deg(v1)
2236
2237 while (deg(u1) > NTL_ZZ_pEX_GCD_CROSSOVER && !IsZero(v1)) {
2238 HalfGCD(u1, v1);
2239
2240 if (!IsZero(v1)) {
2241 rem(u1, u1, v1);
2242 swap(u1, v1);
2243 }
2244 }
2245
2246 PlainGCD(d, u1, v1);
2247 }
2248
19282249
19292250 void XGCD(ZZ_pEX& d, ZZ_pEX& s, ZZ_pEX& t, const ZZ_pEX& a, const ZZ_pEX& b)
19302251 {
1931 ZZ_pE z;
1932
1933
1934 if (IsZero(b)) {
2252 ZZ_pE w;
2253
2254 if (IsZero(a) && IsZero(b)) {
2255 clear(d);
19352256 set(s);
19362257 clear(t);
1937 d = a;
1938 }
1939 else if (IsZero(a)) {
1940 clear(s);
1941 set(t);
1942 d = b;
1943 }
1944 else {
1945 long e = max(deg(a), deg(b)) + 1;
1946
1947 ZZ_pEX temp(INIT_SIZE, e), u(INIT_SIZE, e), v(INIT_SIZE, e),
1948 u0(INIT_SIZE, e), v0(INIT_SIZE, e),
1949 u1(INIT_SIZE, e), v1(INIT_SIZE, e),
1950 u2(INIT_SIZE, e), v2(INIT_SIZE, e), q(INIT_SIZE, e);
1951
1952
1953 set(u1); clear(v1);
1954 clear(u2); set(v2);
1955 u = a; v = b;
1956
1957 do {
1958 DivRem(q, u, u, v);
1959 swap(u, v);
1960 u0 = u2;
1961 v0 = v2;
1962 mul(temp, q, u2);
1963 sub(u2, u1, temp);
1964 mul(temp, q, v2);
1965 sub(v2, v1, temp);
1966 u1 = u0;
1967 v1 = v0;
1968 } while (!IsZero(v));
1969
1970 d = u;
1971 s = u1;
1972 t = v1;
1973 }
1974
1975 if (IsZero(d)) return;
1976 if (IsOne(LeadCoeff(d))) return;
1977
1978 /* make gcd monic */
1979
1980 inv(z, LeadCoeff(d));
1981 mul(d, d, z);
1982 mul(s, s, z);
1983 mul(t, t, z);
1984 }
2258 return;
2259 }
2260
2261 ZZ_pEX U, V, Q;
2262
2263 U = a;
2264 V = b;
2265
2266 long flag = 0;
2267
2268 if (deg(U) == deg(V)) {
2269 DivRem(Q, U, U, V);
2270 swap(U, V);
2271 flag = 1;
2272 }
2273 else if (deg(U) < deg(V)) {
2274 swap(U, V);
2275 flag = 2;
2276 }
2277
2278 _NTL_ZZ_pEXMatrix M;
2279
2280 XHalfGCD(M, U, V, deg(U)+1);
2281
2282 d = U;
2283
2284 if (flag == 0) {
2285 s = M(0,0);
2286 t = M(0,1);
2287 }
2288 else if (flag == 1) {
2289 s = M(0,1);
2290 mul(t, Q, M(0,1));
2291 sub(t, M(0,0), t);
2292 }
2293 else { /* flag == 2 */
2294 s = M(0,1);
2295 t = M(0,0);
2296 }
2297
2298 // normalize
2299
2300 inv(w, LeadCoeff(d));
2301 mul(d, d, w);
2302 mul(s, s, w);
2303 mul(t, t, w);
2304 }
2305
19852306
19862307 void IterBuild(ZZ_pE* a, long n)
19872308 {
0 #include <NTL/ZZ_pXFactoring.h>
1 #include <NTL/ZZ_pEX.h>
2
3 NTL_CLIENT
4
5
6
7 void test(ZZ_pX& P, ZZ_pEX& f, ZZ_pEX& g, ZZ_pEX& h, ZZ_pEX& hx, ZZ_pEX& s, ZZ_pEX& t)
8 {
9 /* P is the polynomial of the extension
10 * f and g the polynomials
11 * h the gcd
12 * hx the gcd obtained using XGCD
13 * s, t are Bezout coefficients hx=f*s+g*t
14 */
15 ZZ_pEX htest,rf,rg;
16
17 if (h!=hx){
18 cout << P << "\n" << f << "\n" << g << "\n";
19 Error("different gcd:\n");
20 }
21
22 if (max(deg(f), deg(g)) > 0 || min(deg(f), deg(g)) >= 0) {
23 if (deg(s) >= deg(g) || deg(t) >= deg(f)) {
24 cout << P << "\n" << f << "\n" << g << "\n";
25 Error("degree bounds at fault:\n");
26 }
27 }
28
29
30 mul(s,s,f);
31 mul(t,t,g);
32 add(htest,t,s);
33 if (h!=htest){
34 cout << P << "\n" << f << "\n" << g << "\n";
35 Error("xgcd at fault:\n");
36 }
37 if (!IsZero(h)){
38 rem(rf,f,h);
39 rem(rg,g,h);
40 if ((!IsZero(rf))||(!IsZero(rg))){
41 cout << P << "\n" << f << "\n" << g << "\n";
42 Error("not a common divisor\n");
43 }
44 }else{
45 if (!IsZero(f) && !IsZero(g)){
46 cout << "debug:\n";
47 cout << P << "\n" << f << "\n" << g << "\n" << h << "\n";
48 Error("ooops:\n");
49 }
50 }
51 }
52
53
54 int main()
55 {
56
57 ZZ prime = conv<ZZ>("340282366920938463463374607619092730237");
58
59 ZZ_p::init(prime);
60
61 ZZ_pX P;
62
63 BuildIrred(P, 3);
64
65 ZZ_pE::init(P);
66
67 for (long i = 0; i < 400; i++) {
68 if (i%10 == 0) cerr << ".";
69 ZZ_pEX f,g,h,s,t,hx;
70
71 long deg_h;
72 if (RandomBnd(2))
73 deg_h = RandomBnd(10)+1;
74 else
75 deg_h = RandomBnd(500)+1;
76
77 random(h, deg_h);
78 SetCoeff(h, deg_h);
79
80 long deg_f;
81 if (RandomBnd(2))
82 deg_f = RandomBnd(10)+1;
83 else
84 deg_f = RandomBnd(1000)+1;
85
86 random(f, deg_f);
87 f *= h;
88
89 long deg_g;
90 if (RandomBnd(2))
91 deg_g = RandomBnd(10)+1;
92 else
93 deg_g = RandomBnd(1000)+1;
94
95 random(g, deg_g);
96 g *= h;
97
98 h = 0;
99
100 GCD(h, f, g);
101 XGCD(hx, s, t, f, g);
102 test(P, f, g, h, hx, s, t);
103 }
104
105 cerr << "\n";
106
107 }
00 #include <NTL/ZZ_pX.h>
11 #include <NTL/BasicThreadPool.h>
2 #include <NTL/FFT_impl.h>
23
34
45 // The mul & sqr routines use routines from ZZX,
1213
1314 #endif
1415
16 NTL_START_IMPL
1517
1618
1719 #if (defined(NTL_GMP_LIP))
2022 #define KARX 80
2123 #endif
2224
23
24
25 NTL_START_IMPL
26
25 #define PAR_THRESH (4000.0)
26
27 #define PAR_THRESH1 (20000.0)
28 // Higher threshold for cheaper operations
29
30 static inline bool BelowThresh(long n)
31 {
32 return double(n)*double(ZZ_p::ModulusSize()) < PAR_THRESH;
33 }
34
35 static inline bool BelowThresh1(long n)
36 {
37 return double(n)*double(ZZ_p::ModulusSize()) < PAR_THRESH1;
38 }
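[editor's note] A rough reading of these cutoffs (assuming ZZ_p::ModulusSize() reports the modulus size in machine words): with a modulus of about 4 words, helper threads are engaged only once roughly 4000/4 = 1000 coefficients are in play, while the cheaper FFTRep add/sub/mul loops guarded by BelowThresh1 wait until about 20000/4 = 5000 points.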
2739
2840
2941
475487
476488 if ( nt == 1 && (
477489
478 (k >= 106 && rat < 1.30) ||
490 (k >= 106 && rat < 1.50) ||
479491 (k >= 212 && rat < 1.75)
480492
481493 )) {
532544
533545 if ( nt == 1 && (
534546
535 (k >= 53 && rat < 1.10) ||
547 (k >= 53 && rat < 1.20) ||
536548 (k >= 106 && rat < 1.30) ||
537549 (k >= 212 && rat < 1.75)
538550
10191031 {
10201032 BasicThreadPool *pool = GetThreadPool();
10211033
1022 if (!pool || pool->active() || pool->NumThreads() == 1) {
1034 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(n)) {
10231035 basic_MulAux(xp, ap, t, n);
10241036 return;
10251037 }
12381250
12391251 BasicThreadPool *pool = GetThreadPool();
12401252
1241 if (!pool || pool->active() || pool->NumThreads() == 1 || hh == aa) {
1253 if (!pool || pool->active() || pool->NumThreads() == 1 || hh == aa || BelowThresh(n)) {
12421254 // Careful! can't parallelize if hh == aa
12431255 basic_MulByXModAux1(n, hh, aa, ff, z);
12441256 return;
13821394
13831395 if (R.k < 0) {
13841396 k = -1;
1397 len = 0;
13851398 return *this;
13861399 }
13871400
13881401 DoSetSize(R.k, R.NumPrimes);
1389 long i, j, n;
1390
1391 n = 1L << k;
1402 len = R.len;
1403
1404 long i, j;
13921405
13931406 for (i = 0; i < NumPrimes; i++)
1394 for (j = 0; j < n; j++)
1407 for (j = 0; j < len; j++)
13951408 tbl[i][j] = R.tbl[i][j];
13961409
13971410 return *this;
14821495
14831496
14841497
1485 NTL_TBDECL(ToFFTRep)(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi)
1498 NTL_TBDECL(ToFFTRep_trunc)(FFTRep& y, const ZZ_pX& x, long k, long len, long lo, long hi)
14861499 // computes an n = 2^k point convolution.
14871500 // if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
14881501 {
15061519 hi = min(hi, deg(x));
15071520
15081521 y.SetSize(k);
1509
15101522 n = 1L << k;
15111523
1524 y.len = len = FFTRoundUp(len, k);
1525
15121526 m = max(hi-lo + 1, 0);
1527 long ilen = FFTRoundUp(m, k);
15131528
15141529 const ZZ_p *xx = x.rep.elts();
15151530
15211536 }
15221537 }
15231538
1524 if (n > m) {
1539 if (ilen > m) {
15251540 for (i = 0; i < nprimes; i++) {
15261541 long *yp = &y.tbl[i][0];
1527 for (j = m; j < n; j++) {
1542 for (j = m; j < ilen; j++) {
15281543 yp[j] = 0;
15291544 }
15301545 }
15491564
15501565 for (i = 0; i < nprimes; i++) {
15511566 long *yp = &y.tbl[i][0];
1552 FFTFwd(yp, yp, k, i);
1567 FFTFwd_trunc(yp, yp, k, i, len, ilen);
15531568 }
15541569 }
15551570
15561571
15571572 #ifdef NTL_THREAD_BOOST
15581573
1559 void ToFFTRep(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi)
1574 void ToFFTRep_trunc(FFTRep& y, const ZZ_pX& x, long k, long len, long lo, long hi)
15601575 // computes an n = 2^k point convolution.
15611576 // if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
15621577 {
15631578 BasicThreadPool *pool = GetThreadPool();
15641579
1565 if (!pool || pool->active() || pool->NumThreads() == 1) {
1566 basic_ToFFTRep(y, x, k, lo, hi);
1580 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << k)) {
1581 basic_ToFFTRep_trunc(y, x, k, len, lo, hi);
15671582 return;
15681583 }
15691584
15851600 hi = min(hi, deg(x));
15861601
15871602 y.SetSize(k);
1588
15891603 n = 1L << k;
15901604
1605 y.len = len = FFTRoundUp(len, k);
1606
15911607 m = max(hi-lo + 1, 0);
1608 long ilen = FFTRoundUp(m, k);
15921609
15931610 const ZZ_p *xx = x.rep.elts();
15941611
16451662 // cache performance. I don't really know if that is an issue.
16461663
16471664 pool->exec_range(nprimes,
1648 [&y, m, n, k](long first, long last) {
1665 [&y, m, n, k, len, ilen](long first, long last) {
16491666 for (long i = first; i < last; i++) {
16501667 long *yp = &y.tbl[i][0];
1651 for (long j = m; j < n; j++) yp[j] = 0;
1652 FFTFwd(yp, yp, k, i);
1668 for (long j = m; j < ilen; j++) yp[j] = 0;
1669 FFTFwd_trunc(yp, yp, k, i, len, ilen);
16531670 }
16541671 } );
16551672 }
16861703 y.SetSize(k);
16871704
16881705 n = 1L << k;
1706 y.len = n;
16891707
16901708 m = max(hi-lo + 1, 0);
16911709
17151733
17161734 for (i = 0; i < nprimes; i++) {
17171735 long *yp = &y.tbl[i][0];
1718 FFTRev1(yp, yp, k, i);
1736 FFTRev1_trans(yp, yp, k, i);
17191737 }
17201738
17211739 }
17321750 {
17331751 BasicThreadPool *pool = GetThreadPool();
17341752
1735 if (!pool || pool->active() || pool->NumThreads() == 1) {
1753 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << k)) {
17361754 basic_RevToFFTRep(y, x, k, lo, hi, offset);
17371755 return;
17381756 }
17541772 y.SetSize(k);
17551773
17561774 n = 1L << k;
1775 y.len = n;
17571776
17581777 m = max(hi-lo + 1, 0);
17591778
18031822 [&y, k](long first, long last) {
18041823 for (long i = first; i < last; i++) {
18051824 long *yp = &y.tbl[i][0];
1806 FFTRev1(yp, yp, k, i);
1825 FFTRev1_trans(yp, yp, k, i);
18071826 }
18081827 } );
18091828
18371856 k = y.k;
18381857 n = (1L << k);
18391858
1840
1841 for (i = 0; i < nprimes; i++) {
1842 long *yp = &y.tbl[i][0];
1843 FFTRev1(yp, yp, k, i);
1844 }
1845
18461859 hi = min(hi, n-1);
18471860 l = hi-lo+1;
18481861 l = max(l, 0);
1862
1863 long len = y.len;
1864 if (len <= hi) LogicError("FromFFTRep: bad len");
1865
1866
1867 for (i = 0; i < nprimes; i++) {
1868 long *yp = &y.tbl[i][0];
1869 FFTRev1_trunc(yp, yp, k, i, len);
1870 }
1871
18491872 x.rep.SetLength(l);
1873
18501874
18511875 for (j = 0; j < l; j++) {
18521876 for (i = 0; i < nprimes; i++)
18691893 {
18701894 BasicThreadPool *pool = GetThreadPool();
18711895
1872 if (!pool || pool->active() || pool->NumThreads() == 1) {
1896 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << y.k)) {
18731897 basic_FromFFTRep(x, y, lo, hi);
18741898 return;
18751899 }
18811905
18821906 k = y.k;
18831907 n = (1L << k);
1884
1885
1886 pool->exec_range(nprimes,
1887 [&y, k](long first, long last) {
1888 for (long i = first; i < last; i++) {
1889 long *yp = &y.tbl[i][0];
1890 FFTRev1(yp, yp, k, i);
1891 }
1892 } );
18931908
18941909 hi = min(hi, n-1);
18951910 l = hi-lo+1;
18961911 l = max(l, 0);
1912
1913 long len = y.len;
1914 if (len <= hi) LogicError("FromFFTRep: bad len");
1915
1916 pool->exec_range(nprimes,
1917 [&y, k, len](long first, long last) {
1918 for (long i = first; i < last; i++) {
1919 long *yp = &y.tbl[i][0];
1920 FFTRev1_trunc(yp, yp, k, i, len);
1921 }
1922 } );
1923
1924
18971925 x.rep.SetLength(l);
18981926 ZZ_p *xx = x.rep.elts();
18991927
19501978 k = y.k;
19511979 n = (1L << k);
19521980
1981 if (y.len != n) LogicError("RevFromFFTRep: bad len");
1982
1983
19531984 long nprimes = FFTInfo->NumPrimes;
19541985 t.SetLength(nprimes);
19551986
19561987 for (i = 0; i < nprimes; i++) {
19571988 long *yp = &y.tbl[i][0];
1958 FFTFwd(yp, yp, k, i);
1989 FFTFwd_trans(yp, yp, k, i);
19591990 }
19601991
19611992 hi = min(hi, n-1);
19782009 {
19792010 BasicThreadPool *pool = GetThreadPool();
19802011
1981 if (!pool || pool->active() || pool->NumThreads() == 1) {
2012 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << y.k)) {
19822013 basic_RevFromFFTRep(x, y, lo, hi);
19832014 return;
19842015 }
19902021
19912022 k = y.k;
19922023 n = (1L << k);
2024
2025 if (y.len != n) LogicError("RevFromFFTRep: bad len");
19932026
19942027
19952028 pool->exec_range(nprimes,
19962029 [&y, k](long first, long last) {
19972030 for (long i = first; i < last; i++) {
19982031 long *yp = &y.tbl[i][0];
1999 FFTFwd(yp, yp, k, i);
2032 FFTFwd_trans(yp, yp, k, i);
20002033 }
20012034 } );
20022035
20562089 k = y.k;
20572090 n = (1L << k);
20582091
2092 hi = min(hi, n-1);
2093 l = hi-lo+1;
2094 l = max(l, 0);
2095
2096 long len = y.len;
2097 if (len <= hi) LogicError("FromFFTRep: bad len");
2098
20592099 z.SetSize(k);
20602100
20612101 for (i = 0; i < nprimes; i++) {
20622102 long *zp = &z.tbl[i][0];
20632103 const long *yp = &y.tbl[i][0];
20642104
2065 FFTRev1(zp, yp, k, i);
2066 }
2105 FFTRev1_trunc(zp, yp, k, i, len);
2106 }
2107
2108 x.rep.SetLength(l);
2109
2110 for (j = 0; j < l; j++) {
2111 for (i = 0; i < nprimes; i++)
2112 t[i] = z.tbl[i][j+lo];
2113
2114 FromModularRep(x.rep[j], t, FFTInfo, TmpSpace);
2115 }
2116
2117 x.normalize();
2118 }
2119
2120 #ifdef NTL_THREAD_BOOST
2121
2122 void NDFromFFTRep(ZZ_pX& x, const FFTRep& y, long lo, long hi, FFTRep& z)
2123
2124 // converts from FFT-representation to coefficient representation
2125 // only the coefficients lo..hi are computed
2126
2127
2128 {
2129 BasicThreadPool *pool = GetThreadPool();
2130
2131 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << y.k)) {
2132 basic_NDFromFFTRep(x, y, lo, hi, z);
2133 return;
2134 }
2135 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2136
2137 long k, n, l;
2138
2139 long nprimes = FFTInfo->NumPrimes;
2140
2141 k = y.k;
2142 n = (1L << k);
20672143
20682144 hi = min(hi, n-1);
20692145 l = hi-lo+1;
20702146 l = max(l, 0);
2071 x.rep.SetLength(l);
2072
2073 for (j = 0; j < l; j++) {
2074 for (i = 0; i < nprimes; i++)
2075 t[i] = z.tbl[i][j+lo];
2076
2077 FromModularRep(x.rep[j], t, FFTInfo, TmpSpace);
2078 }
2079
2080 x.normalize();
2081 }
2082
2083 #ifdef NTL_THREAD_BOOST
2084
2085 void NDFromFFTRep(ZZ_pX& x, const FFTRep& y, long lo, long hi, FFTRep& z)
2086
2087 // converts from FFT-representation to coefficient representation
2088 // only the coefficients lo..hi are computed
2089
2090
2091 {
2092 BasicThreadPool *pool = GetThreadPool();
2093
2094 if (!pool || pool->active() || pool->NumThreads() == 1) {
2095 basic_NDFromFFTRep(x, y, lo, hi, z);
2096 return;
2097 }
2098 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2099
2100 long k, n, l;
2101
2102 long nprimes = FFTInfo->NumPrimes;
2103
2104 k = y.k;
2105 n = (1L << k);
2147
2148 long len = y.len;
2149 if (len <= hi) LogicError("FromFFTRep: bad len");
21062150
21072151 z.SetSize(k);
21082152
21092153 pool->exec_range(nprimes,
2110 [&y, &z, k](long first, long last) {
2154 [&y, &z, k, len](long first, long last) {
21112155 for (long i = first; i < last; i++) {
21122156 long *zp = &z.tbl[i][0];
21132157 const long *yp = &y.tbl[i][0];
2114 FFTRev1(zp, yp, k, i);
2158 FFTRev1_trunc(zp, yp, k, i, len);
21152159 }
21162160 } );
21172161
2118 hi = min(hi, n-1);
2119 l = hi-lo+1;
2120 l = max(l, 0);
21212162 x.rep.SetLength(l);
21222163 ZZ_p *xx = x.rep.elts();
21232164
21562197 NDFromFFTRep(x, y, lo, hi, z);
21572198 }
21582199
2200
2201
21592202 NTL_TBDECL(FromFFTRep)(ZZ_p* x, FFTRep& y, long lo, long hi)
21602203
21612204 // converts from FFT-representation to coefficient representation
21732216
21742217 k = y.k;
21752218 n = (1L << k);
2219
2220 //if (y.len <= min(hi, n-1)) LogicError("FromFFTRep: bad len");
2221 if (y.len != n) LogicError("FromFFTRep: bad len");
21762222
21772223 long nprimes = FFTInfo->NumPrimes;
21782224 t.SetLength(nprimes);
22062252 {
22072253 BasicThreadPool *pool = GetThreadPool();
22082254
2209 if (!pool || pool->active() || pool->NumThreads() == 1) {
2255 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << y.k)) {
22102256 basic_FromFFTRep(x, y, lo, hi);
22112257 return;
22122258 }
22182264
22192265 k = y.k;
22202266 n = (1L << k);
2267
2268 //if (y.len <= min(hi, n-1)) LogicError("FromFFTRep: bad len");
2269 if (y.len != n) LogicError("FromFFTRep: bad len");
22212270
22222271 long nprimes = FFTInfo->NumPrimes;
22232272
22632312 #endif
22642313
22652314
2315
22662316 NTL_TBDECL(mul)(FFTRep& z, const FFTRep& x, const FFTRep& y)
22672317 {
22682318 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
22692319
2270 long k, n, i, j;
2320 long k, i, j;
22712321
22722322 if (x.k != y.k) LogicError("FFT rep mismatch");
22732323
22742324 k = x.k;
2275 n = 1L << k;
22762325
22772326 z.SetSize(k);
2327
2328 long len = z.len = min(x.len, y.len);
22782329
22792330 long nprimes = FFTInfo->NumPrimes;
22802331
22852336 long q = GetFFTPrime(i);
22862337 mulmod_t qinv = GetFFTPrimeInv(i);
22872338
2288 for (j = 0; j < n; j++)
2339 for (j = 0; j < len; j++)
22892340 zp[j] = NormalizedMulMod(xp[j], yp[j], q, qinv);
22902341 }
22912342
22982349 {
22992350 BasicThreadPool *pool = GetThreadPool();
23002351
2301 if (!pool || pool->active() || pool->NumThreads() == 1) {
2352 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh1(1L << x.k)) {
23022353 basic_mul(z, x, y);
23032354 return;
23042355 }
23052356 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
23062357
2307 long k, n;
2358 long k;
23082359
23092360 if (x.k != y.k) LogicError("FFT rep mismatch");
23102361
23112362 k = x.k;
2312 n = 1L << k;
23132363
23142364 z.SetSize(k);
23152365
2366 long len = z.len = min(x.len, y.len);
2367
23162368 long nprimes = FFTInfo->NumPrimes;
23172369
23182370 pool->exec_range(nprimes,
2319 [&x, &y, &z, n](long first, long last) {
2371 [&x, &y, &z, len](long first, long last) {
23202372 for (long i = first; i < last; i++) {
23212373 long *zp = &z.tbl[i][0];
23222374 const long *xp = &x.tbl[i][0];
23242376 long q = GetFFTPrime(i);
23252377 mulmod_t qinv = GetFFTPrimeInv(i);
23262378
2327 for (long j = 0; j < n; j++)
2379 for (long j = 0; j < len; j++)
23282380 zp[j] = NormalizedMulMod(xp[j], yp[j], q, qinv);
23292381 }
23302382 } );
23392391 {
23402392 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
23412393
2342 long k, n, i, j;
2394 long k, i, j;
23432395
23442396 if (x.k != y.k) LogicError("FFT rep mismatch");
23452397
23462398 k = x.k;
2347 n = 1L << k;
23482399
23492400 z.SetSize(k);
2401
2402 long len = z.len = min(x.len, y.len);
23502403
23512404 long nprimes = FFTInfo->NumPrimes;
23522405
23562409 const long *yp = &y.tbl[i][0];
23572410 long q = GetFFTPrime(i);
23582411
2359 for (j = 0; j < n; j++)
2412 for (j = 0; j < len; j++)
23602413 zp[j] = SubMod(xp[j], yp[j], q);
23612414 }
23622415
23692422 {
23702423 BasicThreadPool *pool = GetThreadPool();
23712424
2372 if (!pool || pool->active() || pool->NumThreads() == 1) {
2425 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh1(1L << x.k)) {
23732426 basic_sub(z, x, y);
23742427 return;
23752428 }
23762429 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
23772430
2378 long k, n;
2431 long k;
23792432
23802433 if (x.k != y.k) LogicError("FFT rep mismatch");
23812434
23822435 k = x.k;
2383 n = 1L << k;
23842436
23852437 z.SetSize(k);
23862438
2439 long len = z.len = min(x.len, y.len);
2440
23872441 long nprimes = FFTInfo->NumPrimes;
23882442
23892443 pool->exec_range(nprimes,
2390 [&x, &y, &z, n](long first, long last) {
2444 [&x, &y, &z, len](long first, long last) {
23912445 for (long i = first; i < last; i++) {
23922446 long *zp = &z.tbl[i][0];
23932447 const long *xp = &x.tbl[i][0];
23942448 const long *yp = &y.tbl[i][0];
23952449 long q = GetFFTPrime(i);
23962450
2397 for (long j = 0; j < n; j++)
2451 for (long j = 0; j < len; j++)
23982452 zp[j] = SubMod(xp[j], yp[j], q);
23992453 }
24002454 } );
24092463 {
24102464 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
24112465
2412 long k, n, i, j;
2466 long k, i, j;
24132467
24142468 if (x.k != y.k) LogicError("FFT rep mismatch");
24152469
24162470 k = x.k;
2417 n = 1L << k;
24182471
24192472 z.SetSize(k);
2473
2474 long len = z.len = min(x.len, y.len);
24202475
24212476 long nprimes = FFTInfo->NumPrimes;
24222477
24262481 const long *yp = &y.tbl[i][0];
24272482 long q = GetFFTPrime(i);
24282483
2429 for (j = 0; j < n; j++)
2484 for (j = 0; j < len; j++)
24302485 zp[j] = AddMod(xp[j], yp[j], q);
24312486 }
24322487
24392494 {
24402495 BasicThreadPool *pool = GetThreadPool();
24412496
2442 if (!pool || pool->active() || pool->NumThreads() == 1) {
2497 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh1(1L << x.k)) {
24432498 basic_add(z, x, y);
24442499 return;
24452500 }
24462501 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
24472502
2448 long k, n;
2503 long k;
24492504
24502505 if (x.k != y.k) LogicError("FFT rep mismatch");
24512506
24522507 k = x.k;
2453 n = 1L << k;
24542508
24552509 z.SetSize(k);
24562510
2511 long len = z.len = min(x.len, y.len);
2512
24572513 long nprimes = FFTInfo->NumPrimes;
24582514
24592515 pool->exec_range(nprimes,
2460 [&x, &y, &z, n](long first, long last) {
2516 [&x, &y, &z, len](long first, long last) {
24612517 for (long i = first; i < last; i++) {
24622518 long *zp = &z.tbl[i][0];
24632519 const long *xp = &x.tbl[i][0];
24642520 const long *yp = &y.tbl[i][0];
24652521 long q = GetFFTPrime(i);
24662522
2467 for (long j = 0; j < n; j++)
2523 for (long j = 0; j < len; j++)
24682524 zp[j] = AddMod(xp[j], yp[j], q);
24692525 }
24702526 } );
24722528 }
24732529
24742530 #endif
2475
24762531
24772532
24782533
24922547 n = 1L << k;
24932548
24942549 if (l < k) LogicError("reduce: bad operands");
2550 if (a.len < n) LogicError("reduce: bad len");
24952551
24962552 x.SetSize(k);
2497
2553 x.len = n;
2554
2555 if (&x == &a) return;
24982556
24992557 long nprimes = FFTInfo->NumPrimes;
25002558
25022560 ap = &a.tbl[i][0];
25032561 xp = &x.tbl[i][0];
25042562 for (j = 0; j < n; j++)
2505 xp[j] = ap[j << (l-k)];
2563 xp[j] = ap[j];
25062564 }
25072565 }
25082566
25152573 {
25162574 BasicThreadPool *pool = GetThreadPool();
25172575
2518 if (!pool || pool->active() || pool->NumThreads() == 1) {
2576 if (&x == &a || !pool || pool->active() || pool->NumThreads() == 1 || BelowThresh1(1L << k)) {
25192577 basic_reduce(x, a, k);
25202578 return;
25212579 }
25282586 n = 1L << k;
25292587
25302588 if (l < k) LogicError("reduce: bad operands");
2589 if (a.len < n) LogicError("reduce: bad len");
25312590
25322591 x.SetSize(k);
2592 x.len = n;
25332593
25342594
25352595 long nprimes = FFTInfo->NumPrimes;
25402600 const long *ap = &a.tbl[i][0];
25412601 long *xp = &x.tbl[i][0];
25422602 for (long j = 0; j < n; j++)
2543 xp[j] = ap[j << (l-k)];
2603 xp[j] = ap[j];
25442604 }
25452605 } );
25462606 }
25472607
2608
25482609 #endif
2610
25492611
25502612
25512613
25622624 n = 1L << k;
25632625
25642626 if (l < k) LogicError("AddExpand: bad args");
2627
2628 if (a.len != n) LogicError("AddExpand: bad len");
2629 if (x.len < n) LogicError("AddExpand: bad len");
25652630
25662631
25672632 long nprimes = FFTInfo->NumPrimes;
25712636 const long *ap = &a.tbl[i][0];
25722637 long *xp = &x.tbl[i][0];
25732638 for (j = 0; j < n; j++) {
2574 long j1 = j << (l-k);
2575 xp[j1] = AddMod(xp[j1], ap[j], q);
2639 xp[j] = AddMod(xp[j], ap[j], q);
25762640 }
25772641 }
25782642 }
25842648 {
25852649 BasicThreadPool *pool = GetThreadPool();
25862650
2587 if (!pool || pool->active() || pool->NumThreads() == 1) {
2651 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh1(1L << a.k)) {
25882652 basic_AddExpand(x, a);
25892653 return;
25902654 }
25972661 n = 1L << k;
25982662
25992663 if (l < k) LogicError("AddExpand: bad args");
2664
2665 if (a.len != n) LogicError("AddExpand: bad len");
2666 if (x.len < n) LogicError("AddExpand: bad len");
26002667
26012668
26022669 long nprimes = FFTInfo->NumPrimes;
26082675 const long *ap = &a.tbl[i][0];
26092676 long *xp = &x.tbl[i][0];
26102677 for (long j = 0; j < n; j++) {
2611 long j1 = j << (l-k);
2612 xp[j1] = AddMod(xp[j1], ap[j], q);
2678 xp[j] = AddMod(xp[j], ap[j], q);
26132679 }
26142680 }
26152681 } );
26162682 }
26172683
2618
26192684 #endif
2685
2686
2687
26202688
26212689
26222690
26562724 {
26572725 BasicThreadPool *pool = GetThreadPool();
26582726
2659 if (!pool || pool->active() || pool->NumThreads() == 1) {
2727 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(max(hi-lo+1,0))) {
26602728 basic_ToZZ_pXModRep(y, x, lo, hi);
26612729 return;
26622730 }
27272795
27282796
27292797 x.SetSize(k);
2798 x.len = n;
27302799
27312800 long nprimes = FFTInfo->NumPrimes;
27322801
27562825 {
27572826 BasicThreadPool *pool = GetThreadPool();
27582827
2759 if (!pool || pool->active() || pool->NumThreads() == 1) {
2828 if (!pool || pool->active() || pool->NumThreads() == 1 || BelowThresh(1L << k)) {
27602829 basic_ToFFTRep(x, a, k, lo, hi);
27612830 return;
27622831 }
27782847
27792848
27802849 x.SetSize(k);
2850 x.len = n;
27812851
27822852 long nprimes = FFTInfo->NumPrimes;
27832853
28182888 long k = a.k;
28192889 long n = 1L << k;
28202890
2891 if (a.len != n) LogicError("FromFFTRep: bad len");
2892
28212893 x.SetSize(n);
28222894 for (long i = 0; i < nprimes; i++) {
28232895 long *xp = &x.tbl[i][0];
28542926
28552927
28562928
2857
2858
2859
2860
2861
28622929 void FFTMul(ZZ_pX& x, const ZZ_pX& a, const ZZ_pX& b)
28632930 {
2864 long k, d;
2865
28662931 if (IsZero(a) || IsZero(b)) {
28672932 clear(x);
28682933 return;
28692934 }
28702935
2871 d = deg(a) + deg(b);
2872 k = NextPowerOfTwo(d+1);
2936 long da = deg(a);
2937 long db = deg(b);
2938 long d = da+db;
2939 long k = NextPowerOfTwo(d+1);
28732940
28742941 FFTRep R1(INIT_SIZE, k), R2(INIT_SIZE, k);
28752942
2876 ToFFTRep(R1, a, k);
2877 ToFFTRep(R2, b, k);
2943 ToFFTRep_trunc(R1, a, k, d+1);
2944 ToFFTRep_trunc(R2, b, k, d+1);
28782945 mul(R1, R1, R2);
28792946 FromFFTRep(x, R1, 0, d);
28802947 }
28812948
28822949 void FFTSqr(ZZ_pX& x, const ZZ_pX& a)
28832950 {
2884 long k, d;
2885
28862951 if (IsZero(a)) {
28872952 clear(x);
28882953 return;
28892954 }
28902955
2891 d = 2*deg(a);
2892 k = NextPowerOfTwo(d+1);
2956 long da = deg(a);
2957 long d = 2*da;
2958 long k = NextPowerOfTwo(d+1);
28932959
28942960 FFTRep R1(INIT_SIZE, k);
28952961
2896 ToFFTRep(R1, a, k);
2962 ToFFTRep_trunc(R1, a, k, d+1);
28972963 mul(R1, R1, R1);
28982964 FromFFTRep(x, R1, 0, d);
28992965 }
2966
29002967
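Note on the truncated-FFT change above: for FFTMul the product degree is d = deg(a)+deg(b), so only d+1 transform points carry information even though the convolution ring has 2^k >= d+1 points; this is what ToFFTRep_trunc exploits. A minimal standalone sketch of that size calculation (hypothetical helper, not NTL API):

    // Given deg(a) = da and deg(b) = db, return the log2 of the full FFT size
    // together with the truncated length d+1 that the product actually needs.
    #include <utility>

    std::pair<long, long> fft_sizes(long da, long db)
    {
        long d = da + db;
        long k = 0;
        while ((1L << k) < d + 1) k++;   // plays the role of NextPowerOfTwo(d+1)
        return { k, d + 1 };             // (k, truncated transform length)
    }
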
29012968
29022969 void CopyReverse(ZZ_pX& x, const ZZ_pX& a, long lo, long hi)
29783045 FFTRep R1(INIT_SIZE, F.l);
29793046 ZZ_pX P1(INIT_SIZE, n);
29803047
2981 ToFFTRep(R1, a, F.l, n, 2*(n-1));
3048 ToFFTRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
29823049 mul(R1, R1, F.HRep);
29833050 FromFFTRep(P1, R1, n-2, 2*n-4);
29843051
30333100 FFTRep R1(INIT_SIZE, F.l);
30343101 ZZ_pX P1(INIT_SIZE, n), qq;
30353102
3036 ToFFTRep(R1, a, F.l, n, 2*(n-1));
3103 ToFFTRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
30373104 mul(R1, R1, F.HRep);
30383105 FromFFTRep(P1, R1, n-2, 2*n-4);
30393106 qq = P1;
30893156 FFTRep R1(INIT_SIZE, F.l);
30903157 ZZ_pX P1(INIT_SIZE, n);
30913158
3092 ToFFTRep(R1, a, F.l, n, 2*(n-1));
3159 ToFFTRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
30933160 mul(R1, R1, F.HRep);
30943161 FromFFTRep(x, R1, n-2, 2*n-4);
30953162 }
32903357 FFTRep R1(INIT_SIZE, k), R2(INIT_SIZE, F.l);
32913358 ZZ_pX P1(INIT_SIZE, n);
32923359
3293 ToFFTRep(R1, a, k);
3294 ToFFTRep(R2, b, k);
3360 ToFFTRep_trunc(R1, a, k, max(1L << F.k, d));
3361 ToFFTRep_trunc(R2, b, k, max(1L << F.k, d));
32953362 mul(R1, R1, R2);
32963363 NDFromFFTRep(P1, R1, n, d-1, R2); // save R1 for future use
32973364
3298 ToFFTRep(R2, P1, F.l);
3365 ToFFTRep_trunc(R2, P1, F.l, 2*n-3);
32993366 mul(R2, R2, F.HRep);
33003367 FromFFTRep(P1, R2, n-2, 2*n-4);
33013368
33343401 FFTRep R1(INIT_SIZE, k), R2(INIT_SIZE, F.l);
33353402 ZZ_pX P1(INIT_SIZE, n);
33363403
3337 ToFFTRep(R1, a, k);
3404 ToFFTRep_trunc(R1, a, k, max(1L << F.k, d));
33383405 mul(R1, R1, R1);
33393406 NDFromFFTRep(P1, R1, n, d-1, R2); // save R1 for future use
33403407
3341 ToFFTRep(R2, P1, F.l);
3408 ToFFTRep_trunc(R2, P1, F.l, 2*n-3);
33423409 mul(R2, R2, F.HRep);
33433410 FromFFTRep(P1, R2, n-2, 2*n-4);
33443411
35543621 ZZ_pX P1(INIT_SIZE, n);
35553622
35563623
3557 ToFFTRep(R1, b, F.l);
3624 ToFFTRep_trunc(R1, b, F.l, 2*n-2);
35583625 reduce(x.B2, R1, F.k);
35593626 mul(R1, R1, F.HRep);
35603627 FromFFTRep(P1, R1, n-1, 2*n-3);
3628
35613629 ToFFTRep(x.B1, P1, F.l);
3630 // could be truncated to length max(1L << F.k, 2*n-2), except
3631 // for the usage in UpdateMap, where we would have to investigate
3632 // further
35623633 }
35633634
35643635
35893660 ZZ_pX P1(INIT_SIZE, n), P2(INIT_SIZE, n);
35903661 FFTRep R1(INIT_SIZE, F.l), R2(INIT_SIZE, F.l);
35913662
3592 ToFFTRep(R1, a, F.l);
3663 ToFFTRep_trunc(R1, a, F.l, max(1L << F.k, 2*n-2));
35933664 mul(R2, R1, B.B1);
35943665 FromFFTRep(P1, R2, n-1, 2*n-3);
35953666
296296 }
297297
298298
299 void mul(ZZ_pXMatrix& A, ZZ_pXMatrix& B, ZZ_pXMatrix& C)
299 void mul(ZZ_pXMatrix& A, ZZ_pXMatrix& B, ZZ_pXMatrix& C)
300300 // A = B*C, B and C are destroyed
301301 {
302302 long db = deg(B(1,1));
0 #include <NTL/ZZ_pX.h>
1 #include <NTL/ZZX.h>
2 #include <NTL/BasicThreadPool.h>
3
4 NTL_CLIENT
5
6
7 #define ITER (500)
8
9
10 void multest()
11 {
12 cerr << "mul";
13 for (long iter = 0; iter < ITER; iter++) {
14 if (iter % 100 == 0) cerr << ".";
15
16 long da, db;
17
18 if (RandomBnd(2)) {
19 da = RandomBnd(5000) + 100;
20 db = RandomBnd(5000) + 100;
21 }
22 else {
23 da = RandomBnd(200) + 1;
24 db = RandomBnd(200) + 1;
25 }
26
27 ZZ_pX a, b, c1, c2;
28
29 random(a, da);
30 random(b, db);
31
32 FFTMul(c1, a, b);
33
34 ZZX A, B, C;
35 conv(A, a);
36 conv(B, b);
37 mul(C, A, B);
38 conv(c2, C);
39
40 if (c1 != c2) {
41 cerr << "******* oops\n";
42 break;
43 }
44 }
45
46 cerr << "\n";
47 }
48
49
50 void sqrtest()
51 {
52 cerr << "sqr";
53 for (long iter = 0; iter < ITER; iter++) {
54 if (iter % 100 == 0) cerr << ".";
55
56 long da = RandomBnd(5000) + 100;
57 long db = RandomBnd(5000) + 100;
58
59 ZZ_pX a, b, c1, c2;
60
61 random(a, da);
62
63 if (deg(a) < 80) {
64 cerr << "*";
65 continue;
66 }
67
68 FFTSqr(c1, a);
69
70 ZZX A, B, C;
71 conv(A, a);
72 sqr(C, A);
73 conv(c2, C);
74
75 if (c1 != c2) {
76 cerr << "******* oops\n";
77 break;
78 }
79 }
80
81 cerr << "\n";
82 }
83
84
85
86
87 void mulmodtest()
88 {
89 cerr << "mulmod";
90 for (long iter = 0; iter < ITER; iter++) {
91 if (iter % 100 == 0) cerr << ".";
92
93 long n = RandomBnd(5000) + 300;
94 long da = RandomBnd(n)+1;
95 long db = RandomBnd(n)+1;
96
97 if (RandomBnd(2)) { da = n; db = n; }
98
99 ZZ_pX f;
100 random(f, n);
101 SetCoeff(f, n);
102 ZZ_pXModulus F(f);
103
104 ZZ_pX a, b, c1, c2;
105 random(a, da);
106 random(b, db);
107
108 MulMod(c1, a, b, F);
109
110 ZZX A, B, C;
111 conv(A, a);
112 conv(B, b);
113 mul(C, A, B);
114 conv(c2, C);
115 rem(c2, c2, F);
116
117 if (c1 != c2) {
118 cerr << "******** oops\n";
119 break;
120 }
121 }
122
123 cerr << "\n";
124 }
125
126
127 void sqrmodtest()
128 {
129 cerr << "sqrmod";
130 for (long iter = 0; iter < ITER; iter++) {
131 if (iter % 100 == 0) cerr << ".";
132
133 long n = RandomBnd(5000) + 300;
134 long da = RandomBnd(n)+1;
135 long db = RandomBnd(n)+1;
136
137 if (RandomBnd(2)) { da = n; db = n; }
138
139 ZZ_pX f;
140 random(f, n);
141 SetCoeff(f, n);
142 ZZ_pXModulus F(f);
143
144 ZZ_pX a, b, c1, c2;
145 random(a, da);
146 random(b, db);
147
148 SqrMod(c1, a, F);
149
150 ZZX A, B, C;
151 conv(A, a);
152 conv(B, b);
153 sqr(C, A);
154 conv(c2, C);
155 rem(c2, c2, F);
156
157 if (c1 != c2) {
158 cerr << "******** oops\n";
159 break;
160 }
161 }
162
163 cerr << "\n";
164 }
165
166
167
168 void mulmod1test()
169 {
170 cerr << "mulmod1";
171 for (long iter = 0; iter < ITER; iter++) {
172 if (iter % 100 == 0) cerr << ".";
173
174 long n = RandomBnd(5000) + 300;
175 long da = RandomBnd(n)+1;
176 long db = RandomBnd(n)+1;
177
178 if (RandomBnd(2)) { da = n; db = n; }
179
180 ZZ_pX f;
181 random(f, n);
182 SetCoeff(f, n);
183 ZZ_pXModulus F(f);
184
185 ZZ_pX a, b, c1, c2;
186 random(a, da);
187 random(b, db);
188
189 ZZ_pXMultiplier bb;
190 build(bb, b, F);
191
192 MulMod(c1, a, bb, F);
193
194 ZZX A, B, C;
195 conv(A, a);
196 conv(B, b);
197 mul(C, A, B);
198 conv(c2, C);
199 rem(c2, c2, F);
200
201 if (c1 != c2) {
202 cerr << "******** oops\n";
203 break;
204 }
205 }
206
207 cerr << "\n";
208 }
209
210
211 namespace NTL {
212
213 void CopyReverse(ZZ_pX& x, const ZZ_pX& a, long lo, long hi);
214
215 }
216
217
218
219 struct ZZ_pXTransMultiplier {
220 ZZ_pX f0, fbi, b;
221 long shamt, shamt_fbi, shamt_b;
222 };
223
224
225
226
227 void build(ZZ_pXTransMultiplier& B, const ZZ_pX& b, const ZZ_pXModulus& F)
228 {
229 long db = deg(b);
230
231 if (db >= F.n) LogicError("build TransMultiplier: bad args");
232
233 ZZ_pX t;
234
235 LeftShift(t, b, F.n-1);
236 div(t, t, F);
237
238 // we optimize for low degree b
239
240 long d;
241
242 d = deg(t);
243 if (d < 0)
244 B.shamt_fbi = 0;
245 else
246 B.shamt_fbi = F.n-2 - d;
247
248 CopyReverse(B.fbi, t, 0, d);
249
250 // The following code optimizes the case when
251 // f = X^n + low degree poly
252
253 trunc(t, F.f, F.n);
254 d = deg(t);
255 if (d < 0)
256 B.shamt = 0;
257 else
258 B.shamt = d;
259
260 CopyReverse(B.f0, t, 0, d);
261
262 if (db < 0)
263 B.shamt_b = 0;
264 else
265 B.shamt_b = db;
266
267 CopyReverse(B.b, b, 0, db);
268 }
269
270
271
272 void TransMulMod(ZZ_pX& x, const ZZ_pX& a, const ZZ_pXTransMultiplier& B,
273 const ZZ_pXModulus& F)
274 {
275 if (deg(a) >= F.n) LogicError("TransMulMod: bad args");
276
277 ZZ_pX t1, t2;
278
279 mul(t1, a, B.b);
280 RightShift(t1, t1, B.shamt_b);
281
282 mul(t2, a, B.f0);
283 RightShift(t2, t2, B.shamt);
284 trunc(t2, t2, F.n-1);
285
286 mul(t2, t2, B.fbi);
287 if (B.shamt_fbi > 0) LeftShift(t2, t2, B.shamt_fbi);
288 trunc(t2, t2, F.n-1);
289 LeftShift(t2, t2, 1);
290
291 sub(x, t1, t2);
292 }
293
294
295
296 void UpdateMap(vec_ZZ_p& x, const vec_ZZ_p& a,
297 const ZZ_pXTransMultiplier& B, const ZZ_pXModulus& F)
298 {
299 ZZ_pX xx;
300 TransMulMod(xx, to_ZZ_pX(a), B, F);
301 x = xx.rep;
302 }
303
304
305
306 void updatetest()
307 {
308 cerr << "update";
309 for (long iter = 0; iter < ITER; iter++) {
310 if (iter % 100 == 0) cerr << ".";
311
312 long n = RandomBnd(5000) + 300;
313 long da = RandomBnd(n)+1;
314 long db = RandomBnd(n)+1;
315
316 if (RandomBnd(2)) { da = n; db = n; }
317
318 ZZ_pX f;
319 random(f, n);
320 SetCoeff(f, n);
321 ZZ_pXModulus F(f);
322
323 ZZ_pX a, b;
324 random(a, da);
325 random(b, db);
326
327 ZZ_pXMultiplier bb1;
328 build(bb1, b, F);
329
330 ZZ_pXTransMultiplier bb2;
331 build(bb2, b, F);
332
333 Vec<ZZ_p> x1, x2;
334
335 UpdateMap(x1, a.rep, bb1, F);
336 UpdateMap(x2, a.rep, bb2, F);
337
338
339 if (x1 != x2) {
340 cerr << "******** oops\n";
341 break;
342 }
343 }
344
345 cerr << "\n";
346 }
347
348 void divremtest()
349 {
350 cerr << "divrem";
351 for (long iter = 0; iter < ITER; iter++) {
352 if (iter % 100 == 0) cerr << ".";
353
354 long n = RandomBnd(5000) + 300;
355 long dq = RandomBnd(n);
356
357
358 ZZ_pX f;
359 random(f, n);
360 SetCoeff(f, n);
361 ZZ_pXModulus F(f);
362
363 ZZ_pX a, q, r, q1, r1;
364
365 random(a, 2*n-1);
366
367 DivRem(q, r, a, F);
368 rem(r1, a, F);
369 div(q1, a, F);
370
371 if (deg(r) >= n || a != q*f + r || q != q1 || r != r1) {
372 cerr << "******** oops\n";
373 break;
374 }
375 }
376
377 cerr << "\n";
378 }
379
380
381 int main()
382 {
383 ZZ p;
384 GenPrime(p, 100);
385
386 ZZ_p::init(p);
387
388 multest();
389 sqrtest();
390 mulmodtest();
391 sqrmodtest();
392 mulmod1test();
393 divremtest();
394 updatetest();
395
396 #ifdef NTL_THREAD_BOOST
397
398 GenPrime(p, 500);
399 ZZ_p::init(p);
400
401 SetNumThreads(4);
402 cerr << "numthreads=4\n";
403
404 multest();
405 sqrtest();
406 mulmodtest();
407 sqrmodtest();
408 mulmod1test();
409 divremtest();
410 updatetest();
411
412 #endif
413
414 }
415
224224
225225 #endif
226226
227 #if @{NTL_ENABLE_AVX_FFT}
228 #define NTL_ENABLE_AVX_FFT
229
230 /*
231 * This will compile NTL in a way that enables an AVX implementation
232 * of the small-prime FFT.
233 */
234
235 #endif
236
237
238 #if @{NTL_AVOID_AVX512}
239 #define NTL_AVOID_AVX512
240
241 /*
242 * This will compile NTL in a way that avoids 512-bit operations,
243 * even if AVX512 is available.
244 */
245
246 #endif
227247
228248 #if @{NTL_RANGE_CHECK}
229249 #define NTL_RANGE_CHECK
133133 /*
134134 * On machines with wide floating point registers, the routine _ntl_ForceToMem
135135 * is used to force a floating point double to a memory location.
136 *
136 * I've checked with GCC, and even with LTO, this will work.
137 * That said, I wouldn't really recommend applying LTO to NTL...
137138 */
138139
139140 void _ntl_ForceToMem(double *p)
163164
164165 double _ntl_ldexp(double x, long e)
165166 {
167 if (x == 0.0) return x;
168
166169 if (e > NTL_MAX_INT)
167170 return x/_ntl_ldexp_zero;
168171 else if (e < NTL_MIN_INT)
102102 if (nail_bits > 0)
103103 fprintf(stderr, "WARNING: GMP_NAIL_BITS > 0: this has not been well tested\n");
104104
105 if (__GNU_MP_VERSION < 5)
106 Error("GMP version 5.0.0 or later required");
105107
106108 // check that GMP_LIMB_BITS == mp_bits_per_limb as a consistency check
107109 if (GMP_LIMB_BITS != mp_bits_per_limb)
3535
3636 #ifdef NTL_GMP_LIP
3737 #include <gmp.h>
38
39 #if (__GNU_MP_VERSION < 5)
40 #error "GMP version 5.0.0 or later required"
41 #endif
42
3843 #endif
3944
4045 NTL_IMPORT_FROM_STD
4146 NTL_USE_NNS
4247
4348
49 #if (defined(NTL_HAVE_LL_TYPE) && NTL_BITS_PER_LIMB_T == NTL_BITS_PER_LONG)
50 #define NTL_VIABLE_LL
51 #endif
4452
4553
4654 #ifdef NTL_GMP_LIP
225233 {
226234 long i;
227235
228 i = 0;
229 do
230 {
231 _ntl_limb_t r = ap[i] + b;
232 rp[i] = CLIP(r);
233 b = r >> NTL_ZZ_NBITS;
234 }
235 while (++i < n);
236
237 return b;
236 if (rp != ap) {
237 i = 0;
238 do
239 {
240 _ntl_limb_t r = ap[i] + b;
241 rp[i] = CLIP(r);
242 b = r >> NTL_ZZ_NBITS;
243 }
244 while (++i < n);
245
246 return b;
247 }
248 else {
249 i = 0;
250 do
251 {
252 if (!b) return 0;
253 _ntl_limb_t r = ap[i] + b;
254 rp[i] = CLIP(r);
255 b = r >> NTL_ZZ_NBITS;
256 }
257 while (++i < n);
258
259 return b;
260 }
238261 }
239262
240263
272295 {
273296 long i;
274297
275 i = 0;
276 do
277 {
278 _ntl_limb_t r = ap[i] - b;
279 rp[i] = CLIP(r);
280 b = (r >> NTL_ZZ_NBITS) & 1;
281 }
282 while (++i < n);
283
284 return b;
298 if (rp != ap) {
299 i = 0;
300 do
301 {
302 _ntl_limb_t r = ap[i] - b;
303 rp[i] = CLIP(r);
304 b = (r >> NTL_ZZ_NBITS) & 1;
305 }
306 while (++i < n);
307
308 return b;
309 }
310 else {
311 i = 0;
312 do
313 {
314 if (!b) return 0;
315 _ntl_limb_t r = ap[i] - b;
316 rp[i] = CLIP(r);
317 b = (r >> NTL_ZZ_NBITS) & 1;
318 }
319 while (++i < n);
320
321 return b;
322 }
323
285324 }
286325
287326
824863 _ntl_mpn_base_sqr(c, a, sa);
825864 }
826865
866
867 // Like the corresponding GMP routine, this assumes un >= vn >= 1
827868 _ntl_limb_t
828869 _ntl_mpn_mul (_ntl_limb_t* rp, const _ntl_limb_t* up, long un, const _ntl_limb_t* vp, long vn)
829870 {
9861027 }
9871028 }
9881029
989
990 #else
991
992
993
994 #if (__GNU_MP_VERSION < 3)
995
996 #error "You have to use GMP version >= 3.1"
997
998 #endif
999
1000 #if ((__GNU_MP_VERSION == 3) && (__GNU_MP_VERSION_MINOR < 1))
1001
1002 #error "You have to use GMP version >= 3.1"
1003
1004 #endif
1005
1006
1007
1008 /* v 3.1 is supposed mpn_tdiv_qr defined, but it doesn't.
1009 Here's a workaround */
1010
1011 #if ((__GNU_MP_VERSION == 3) && (__GNU_MP_VERSION_MINOR == 1) && (__GNU_MP_VERSION_PATCHLEVEL == 0))
1012
1013 #define mpn_tdiv_qr __MPN(tdiv_qr)
1014
1015
1016 extern "C"
1017 void mpn_tdiv_qr(mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t,
1018 mp_srcptr, mp_size_t);
1019
1020 #endif
10211030
10221031 #endif
10231032
10781087
10791088 static
10801089 inline long& ALLOC(_ntl_gbigint p)
1081 { return (((long *) p)[0]); }
1090 { return p->alloc_; }
10821091
10831092 static
10841093 inline long& SIZE(_ntl_gbigint p)
1085 { return (((long *) p)[1]); }
1094 { return p->size_; }
10861095
10871096 static
10881097 inline _ntl_limb_t * DATA(_ntl_gbigint p)
1089 { return ((_ntl_limb_t *) (((long *) (p)) + 2)); }
1098 { return (_ntl_limb_t *) (p+1); }
10901099
10911100 static
10921101 inline long STORAGE(long len)
1093 { return ((long)(2*sizeof(long) + (len)*sizeof(_ntl_limb_t))); }
1102 { return ((long)(sizeof(_ntl_gbigint_body) + (len)*sizeof(_ntl_limb_t))); }
10941103
10951104 static
10961105 inline long MustAlloc(_ntl_gbigint c, long len)
11771186
11781187
11791188
1180 #if (defined(NTL_HAVE_LL_TYPE) && NTL_BITS_PER_LIMB_T == NTL_BITS_PER_LONG)
1181 #define NTL_VIABLE_LL
1182 #endif
11831189
11841190 #if (defined(NTL_CRT_ALTCODE) || defined(NTL_CRT_ALTCODE_SMALL))
11851191
13761382
13771383 len++; /* always allocate at least one more than requested */
13781384
1379 oldlen = (long) (oldlen * 1.4); /* always increase by at least 40% */
1385 oldlen = _ntl_vec_grow(oldlen);
13801386 if (len < oldlen)
13811387 len = oldlen;
13821388
16341640
16351641 if (b<0) LogicError("_ntl_gsetbit: negative index");
16361642
1637 if (ZEROP(*a)) {
1638 _ntl_gintoz(1, a);
1639 _ntl_glshift(*a, b, a);
1640 return 0;
1641 }
1642
16431643 bl = (b/NTL_ZZ_NBITS);
16441644 wh = ((_ntl_limb_t) 1) << (b - NTL_ZZ_NBITS*bl);
16451645
1646 GET_SIZE_NEG(sa, aneg, *a);
1646 if (!*a)
1647 sa = aneg = 0;
1648 else
1649 GET_SIZE_NEG(sa, aneg, *a);
16471650
16481651 if (sa > bl) {
16491652 adata = DATA(*a);
16751678
16761679 if (b<0) LogicError("_ntl_gswitchbit: negative index");
16771680
1678
1679 if (ZEROP(*a)) {
1680 _ntl_gintoz(1, a);
1681 _ntl_glshift(*a, b, a);
1682 return 0;
1683 }
1684
16851681 bl = (b/NTL_ZZ_NBITS);
16861682 wh = ((_ntl_limb_t) 1) << (b - NTL_ZZ_NBITS*bl);
16871683
1688 GET_SIZE_NEG(sa, aneg, *a);
1684 if (!*a)
1685 sa = aneg = 0;
1686 else
1687 GET_SIZE_NEG(sa, aneg, *a);
16891688
16901689 if (sa > bl) {
16911690 adata = DATA(*a);
25142513 }
25152514 }
25162515
2516
25172517 void
25182518 _ntl_gsadd(_ntl_gbigint a, long b, _ntl_gbigint *cc)
25192519 {
2520 // FIXME: this is really inefficient...too much overhead
2521 GRegister(B);
2522 _ntl_gintoz(b, &B);
2523 _ntl_gadd(a, B, cc);
2524 }
2520 if (b == 0) {
2521 _ntl_gcopy(a, cc);
2522 return;
2523 }
2524
2525 _ntl_limb_t abs_b = ABS(b);
2526
2527 if (XCLIP(abs_b)) {
2528 GRegister(xb);
2529 _ntl_gintoz(b,&xb);
2530 _ntl_gadd(a, xb, cc);
2531 return;
2532 }
2533
2534 long bneg = b < 0;
2535
2536
2537 if (ZEROP(a)) {
2538 if (!*cc) _ntl_gsetlength(cc, 1);
2539 SIZE(*cc) = 1 - 2*bneg;
2540 DATA(*cc)[0] = abs_b;
2541 return;
2542 }
2543
2544 long sa, aneg;
2545
2546 GET_SIZE_NEG(sa, aneg, a);
2547
2548 if (aneg == bneg) {
2549 // signs equal: addition
2550
2551 if (a == *cc) {
2552 // a aliases c
2553
2554 _ntl_limb_t *adata = DATA(a);
2555 _ntl_limb_t carry = NTL_MPN(add_1)(adata, adata, sa, abs_b);
2556
2557 if (carry) {
2558 if (MustAlloc(a, sa+1)) {
2559 _ntl_gsetlength(cc, sa+1);
2560 a = *cc;
2561 adata = DATA(a);
2562 }
2563 adata[sa] = 1;
2564 sa++;
2565 if (aneg) sa = -sa;
2566 SIZE(a) = sa;
2567 }
2568 }
2569 else {
2570 // a and c do not alias
2571 if (MustAlloc(*cc, sa+1)) _ntl_gsetlength(cc, sa+1);
2572 _ntl_limb_t *adata = DATA(a);
2573 _ntl_limb_t *cdata = DATA(*cc);
2574 _ntl_limb_t carry = NTL_MPN(add_1)(cdata, adata, sa, abs_b);
2575 if (carry) {
2576 cdata[sa] = 1;
2577 sa++;
2578 }
2579 if (aneg) sa = -sa;
2580 SIZE(*cc) = sa;
2581 }
2582 }
2583 else {
2584 // opposite sign: subtraction
2585
2586 if (sa == 1) {
2587 _ntl_limb_t abs_a = DATA(a)[0];
2588 if (abs_a == abs_b)
2589 _ntl_gzero(cc);
2590 else if (abs_a > abs_b) {
2591 if (MustAlloc(*cc, 1)) _ntl_gsetlength(cc, 1);
2592 DATA(*cc)[0] = abs_a - abs_b;
2593 SIZE(*cc) = 1-2*aneg;
2594 }
2595 else {
2596 if (MustAlloc(*cc, 1)) _ntl_gsetlength(cc, 1);
2597 DATA(*cc)[0] = abs_b - abs_a;
2598 SIZE(*cc) = -1+2*aneg;
2599 }
2600 }
2601 else {
2602 if (MustAlloc(*cc, sa)) _ntl_gsetlength(cc, sa);
2603 _ntl_limb_t *adata = DATA(a);
2604 _ntl_limb_t *cdata = DATA(*cc);
2605 NTL_MPN(sub_1)(cdata, adata, sa, abs_b);
2606 if (cdata[sa-1] == 0) sa--;
2607 if (aneg) sa = -sa;
2608 SIZE(*cc) = sa;
2609 }
2610 }
2611
2612 }
2613
2614 void
2615 _ntl_gssub(_ntl_gbigint a, long b, _ntl_gbigint *cc)
2616 {
2617 if (b == 0) {
2618 _ntl_gcopy(a, cc);
2619 return;
2620 }
2621
2622 _ntl_limb_t abs_b = ABS(b);
2623
2624 if (XCLIP(abs_b)) {
2625 GRegister(xb);
2626 _ntl_gintoz(b,&xb);
2627 _ntl_gsub(a, xb, cc);
2628 return;
2629 }
2630
2631 // the rest of this routine is precisely the same
2632 // as gsadd, except for the following line,
2633 // which has the sense of the test reversed
2634 long bneg = b >= 0;
2635
2636
2637 if (ZEROP(a)) {
2638 if (!*cc) _ntl_gsetlength(cc, 1);
2639 SIZE(*cc) = 1 - 2*bneg;
2640 DATA(*cc)[0] = abs_b;
2641 return;
2642 }
2643
2644 long sa, aneg;
2645
2646 GET_SIZE_NEG(sa, aneg, a);
2647
2648 if (aneg == bneg) {
2649 // signs equal: addition
2650
2651 if (a == *cc) {
2652 // a aliases c
2653
2654 _ntl_limb_t *adata = DATA(a);
2655 _ntl_limb_t carry = NTL_MPN(add_1)(adata, adata, sa, abs_b);
2656
2657 if (carry) {
2658 if (MustAlloc(a, sa+1)) {
2659 _ntl_gsetlength(cc, sa+1);
2660 a = *cc;
2661 adata = DATA(a);
2662 }
2663 adata[sa] = 1;
2664 sa++;
2665 if (aneg) sa = -sa;
2666 SIZE(a) = sa;
2667 }
2668 }
2669 else {
2670 // a and c do not alias
2671 if (MustAlloc(*cc, sa+1)) _ntl_gsetlength(cc, sa+1);
2672 _ntl_limb_t *adata = DATA(a);
2673 _ntl_limb_t *cdata = DATA(*cc);
2674 _ntl_limb_t carry = NTL_MPN(add_1)(cdata, adata, sa, abs_b);
2675 if (carry) {
2676 cdata[sa] = 1;
2677 sa++;
2678 }
2679 if (aneg) sa = -sa;
2680 SIZE(*cc) = sa;
2681 }
2682 }
2683 else {
2684 // opposite sign: subtraction
2685
2686 if (sa == 1) {
2687 _ntl_limb_t abs_a = DATA(a)[0];
2688 if (abs_a == abs_b)
2689 _ntl_gzero(cc);
2690 else if (abs_a > abs_b) {
2691 if (MustAlloc(*cc, 1)) _ntl_gsetlength(cc, 1);
2692 DATA(*cc)[0] = abs_a - abs_b;
2693 SIZE(*cc) = 1-2*aneg;
2694 }
2695 else {
2696 if (MustAlloc(*cc, 1)) _ntl_gsetlength(cc, 1);
2697 DATA(*cc)[0] = abs_b - abs_a;
2698 SIZE(*cc) = -1+2*aneg;
2699 }
2700 }
2701 else {
2702 if (MustAlloc(*cc, sa)) _ntl_gsetlength(cc, sa);
2703 _ntl_limb_t *adata = DATA(a);
2704 _ntl_limb_t *cdata = DATA(*cc);
2705 NTL_MPN(sub_1)(cdata, adata, sa, abs_b);
2706 if (cdata[sa-1] == 0) sa--;
2707 if (aneg) sa = -sa;
2708 SIZE(*cc) = sa;
2709 }
2710 }
2711
2712 }
2713
2714
25252715
25262716 void
25272717 _ntl_gsub(_ntl_gbigint a, _ntl_gbigint b, _ntl_gbigint *cc)
26682858 SIZE(c) = sc;
26692859 }
26702860
2861 #if 1
2862
2863 // This version is faster for small inputs.
2864 // It avoids some overheads incurred only when dealing with
2865 // aliased outputs.
2866 // It also makes direct calls to lower-level mpn functions
2867 // for smaller inputs (and for one limb inputs, it avoids
2868 // function calls altogether (usually)).
2869
2870 // Speedup: 2.5x 1 limb
2871 // 1.4x 2 limb
2872 // 1.3x 3 limb
2873
2874 static inline _ntl_limb_t
2875 base_mul (_ntl_limb_t* rp, const _ntl_limb_t* up, long un, const _ntl_limb_t* vp, long vn)
2876 {
2877 rp[un] = NTL_MPN(mul_1) (rp, up, un, vp[0]);
2878
2879 while (--vn >= 1)
2880 {
2881 rp += 1, vp += 1;
2882 rp[un] = NTL_MPN(addmul_1) (rp, up, un, vp[0]);
2883 }
2884 return rp[un];
2885 }
2886
2887 void _ntl_gmul(_ntl_gbigint a, _ntl_gbigint b, _ntl_gbigint *cc)
2888 {
2889 long sa, aneg, sb, bneg, alias, sc;
2890 _ntl_limb_t *adata, *bdata, *cdata, msl;
2891 _ntl_gbigint c;
2892
2893 if (ZEROP(a) || ZEROP(b)) {
2894 _ntl_gzero(cc);
2895 return;
2896 }
2897
2898 GET_SIZE_NEG(sa, aneg, a);
2899 GET_SIZE_NEG(sb, bneg, b);
2900
2901 if (a != *cc && b != *cc) {
2902 // no aliasing
2903
2904 c = *cc;
2905
2906 sc = sa + sb;
2907 if (MustAlloc(c, sc)) {
2908 _ntl_gsetlength(&c, sc);
2909 *cc = c;
2910 }
2911
2912 adata = DATA(a);
2913 bdata = DATA(b);
2914 cdata = DATA(c);
2915
2916 if (adata == bdata) {
2917 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
2918 if (sa == 1) {
2919 ll_type prod;
2920 ll_mul(prod, adata[0], adata[0]);
2921 cdata[0] = ll_get_lo(prod);
2922 msl = cdata[1] = ll_get_hi(prod);
2923 } else
2924 #endif
2925 {
2926 NTL_MPN(sqr)(cdata, adata, sa);
2927 msl = cdata[2*sa-1];
2928 }
2929 }
2930 else {
2931 #if 1
2932 if (sa >= sb) {
2933 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
2934 if (sa == 1) {
2935 ll_type prod;
2936 ll_mul(prod, adata[0], bdata[0]);
2937 cdata[0] = ll_get_lo(prod);
2938 msl = cdata[1] = ll_get_hi(prod);
2939 } else
2940 #endif
2941 if (sa <= 4)
2942 msl = base_mul(cdata, adata, sa, bdata, sb);
2943 else
2944 msl = NTL_MPN(mul)(cdata, adata, sa, bdata, sb);
2945 }
2946 else {
2947 if (sb <= 4)
2948 msl = base_mul(cdata, bdata, sb, adata, sa);
2949 else
2950 msl = NTL_MPN(mul)(cdata, bdata, sb, adata, sa);
2951 }
2952 #else
2953 if (sa >= sb) {
2954 msl = NTL_MPN(mul)(cdata, adata, sa, bdata, sb);
2955 }
2956 else {
2957 msl = NTL_MPN(mul)(cdata, bdata, sb, adata, sa);
2958 }
2959 #endif
2960 }
2961
2962 if (!msl) sc--;
2963 if (aneg != bneg) sc = -sc;
2964 SIZE(c) = sc;
2965 }
2966 else {
2967 // aliasing
2968 GRegister(mem);
2969
2970 c = mem;
2971
2972 sc = sa + sb;
2973 if (MustAlloc(c, sc)) {
2974 _ntl_gsetlength(&c, sc);
2975 mem = c;
2976 }
2977
2978 adata = DATA(a);
2979 bdata = DATA(b);
2980 cdata = DATA(c);
2981
2982 if (adata == bdata) {
2983 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
2984 if (sa == 1) {
2985 ll_type prod;
2986 ll_mul(prod, adata[0], adata[0]);
2987 cdata[0] = ll_get_lo(prod);
2988 msl = cdata[1] = ll_get_hi(prod);
2989 } else
2990 #endif
2991 {
2992 NTL_MPN(sqr)(cdata, adata, sa);
2993 msl = cdata[2*sa-1];
2994 }
2995 }
2996 else {
2997 #if 1
2998 if (sa >= sb) {
2999 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
3000 if (sa == 1) {
3001 ll_type prod;
3002 ll_mul(prod, adata[0], bdata[0]);
3003 cdata[0] = ll_get_lo(prod);
3004 msl = cdata[1] = ll_get_hi(prod);
3005 } else
3006 #endif
3007 if (sa <= 4)
3008 msl = base_mul(cdata, adata, sa, bdata, sb);
3009 else
3010 msl = NTL_MPN(mul)(cdata, adata, sa, bdata, sb);
3011 }
3012 else {
3013 if (sb <= 4)
3014 msl = base_mul(cdata, bdata, sb, adata, sa);
3015 else
3016 msl = NTL_MPN(mul)(cdata, bdata, sb, adata, sa);
3017 }
3018 #else
3019 if (sa >= sb) {
3020 msl = NTL_MPN(mul)(cdata, adata, sa, bdata, sb);
3021 }
3022 else {
3023 msl = NTL_MPN(mul)(cdata, bdata, sb, adata, sa);
3024 }
3025 #endif
3026 }
3027
3028 if (!msl) sc--;
3029 if (aneg != bneg) sc = -sc;
3030 SIZE(c) = sc;
3031
3032 _ntl_gcopy(mem, cc);
3033 }
3034
3035 }
3036
3037 #else
26713038 void _ntl_gmul(_ntl_gbigint a, _ntl_gbigint b, _ntl_gbigint *cc)
26723039 {
26733040 GRegister(mem);
27173084
27183085 if (alias) _ntl_gcopy(mem, cc);
27193086 }
3087 #endif
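The one-limb fast path in the new _ntl_gmul above computes the full two-limb product directly via ll_mul instead of calling into mpn. A minimal standalone illustration of the same idea (hypothetical, not NTL code; assumes a 64-bit limb and the GCC/Clang unsigned __int128 extension):

    #include <cstdint>

    // Multiply two single limbs and return both halves of the double-width
    // product, the operation the sa == 1 branch performs via ll_mul and
    // ll_get_lo/ll_get_hi.
    void mul1(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi)
    {
        unsigned __int128 prod = (unsigned __int128) a * b;
        lo = (uint64_t) prod;
        hi = (uint64_t) (prod >> 64);
    }
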
27203088
27213089 void _ntl_gsq(_ntl_gbigint a, _ntl_gbigint *cc)
27223090 {
2723 _ntl_gmul(a, a, cc);
2724 /* this is good enough...eventually, mpn_sqr_n will be called */
3091 long sa, aneg, alias, sc;
3092 _ntl_limb_t *adata, *cdata, msl;
3093 _ntl_gbigint c;
3094
3095 if (ZEROP(a)) {
3096 _ntl_gzero(cc);
3097 return;
3098 }
3099
3100 GET_SIZE_NEG(sa, aneg, a);
3101
3102 if (a != *cc) {
3103 // no aliasing
3104
3105 c = *cc;
3106
3107 sc = sa + sa;
3108 if (MustAlloc(c, sc)) {
3109 _ntl_gsetlength(&c, sc);
3110 *cc = c;
3111 }
3112
3113 adata = DATA(a);
3114 cdata = DATA(c);
3115
3116 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
3117 if (sa == 1) {
3118 ll_type prod;
3119 ll_mul(prod, adata[0], adata[0]);
3120 cdata[0] = ll_get_lo(prod);
3121 msl = cdata[1] = ll_get_hi(prod);
3122 } else
3123 #endif
3124 {
3125 NTL_MPN(sqr)(cdata, adata, sa);
3126 msl = cdata[2*sa-1];
3127 }
3128
3129 if (!msl) sc--;
3130 SIZE(c) = sc;
3131 }
3132 else {
3133 // aliasing
3134 GRegister(mem);
3135
3136 c = mem;
3137
3138 sc = sa + sa;
3139 if (MustAlloc(c, sc)) {
3140 _ntl_gsetlength(&c, sc);
3141 mem = c;
3142 }
3143
3144 adata = DATA(a);
3145 cdata = DATA(c);
3146
3147 #if (1 && defined(NTL_VIABLE_LL) && NTL_NAIL_BITS==0)
3148 if (sa == 1) {
3149 ll_type prod;
3150 ll_mul(prod, adata[0], adata[0]);
3151 cdata[0] = ll_get_lo(prod);
3152 msl = cdata[1] = ll_get_hi(prod);
3153 } else
3154 #endif
3155 {
3156 NTL_MPN(sqr)(cdata, adata, sa);
3157 msl = cdata[2*sa-1];
3158 }
3159
3160 if (!msl) sc--;
3161 SIZE(c) = sc;
3162
3163 _ntl_gcopy(mem, cc);
3164 }
3165
27253166 }
27263167
27273168
37424183 SIZE(d) = sd;
37434184 SIZE(xa) = sxa;
37444185
3745 /* Thes two ForceNormal's are work-arounds for GMP bugs
4186 #if 0
4187 // since we're now requiring GMP version 5.0.0 or later,
4188 // these workarounds are no longer required
4189
4190 /* These two ForceNormal's are work-arounds for GMP bugs
37464191 in GMP 4.3.0 */
37474192 ForceNormal(d);
37484193 ForceNormal(xa);
37874232 }
37884233
37894234 /* end normalize */
4235 #endif
37904236
37914237
37924238 if (aneg) _ntl_gnegate(&xa);
38584304 SIZE(d) = sd;
38594305 SIZE(u) = su;
38604306
4307 #if 0
4308 // since we're now requiring GMP version 5.0.0 or later,
4309 // these workarounds are no longer required
4310
38614311 /* These two ForceNormal's are work-arounds for GMP bugs
38624312 in GMP 4.3.0 */
38634313 ForceNormal(d);
38644314 ForceNormal(u);
4315 #endif
38654316
38664317 if (ONEP(d)) {
38674318
38704321 * GMP is sloppy.
38714322 */
38724323
4324 #if 0
4325 // since we're now requiring GMP version 5.0.0 or later,
4326 // these workarounds are no longer required
38734327
38744328 if (_ntl_gsign(u) < 0) {
38754329 _ntl_gadd(u, nin, &u);
38834337 _ntl_gmod(u, nin, &u);
38844338 }
38854339 }
4340 #else
4341 if (_ntl_gsign(u) < 0) {
4342 _ntl_gadd(u, nin, &u);
4343 }
4344
4345 #endif
38864346
38874347 _ntl_gcopy(u, invv);
38884348 return 0;
63016761 case 3: ll_mul_add(acc, row[3-1], b[3-1]);
63026762 case 2: ll_mul_add(acc, row[2-1], b[2-1]);
63036763 }
6764 #elif (CRT_ALTCODE_UNROLL)
6765 long j = n;
6766 for (; j > 16; j -= 16) {
6767 ll_mul_add(acc, row[j-1], b[j-1]);
6768 ll_mul_add(acc, row[j-2], b[j-2]);
6769 ll_mul_add(acc, row[j-3], b[j-3]);
6770 ll_mul_add(acc, row[j-4], b[j-4]);
6771 ll_mul_add(acc, row[j-5], b[j-5]);
6772 ll_mul_add(acc, row[j-6], b[j-6]);
6773 ll_mul_add(acc, row[j-7], b[j-7]);
6774 ll_mul_add(acc, row[j-8], b[j-8]);
6775 ll_mul_add(acc, row[j-9], b[j-9]);
6776 ll_mul_add(acc, row[j-10], b[j-10]);
6777 ll_mul_add(acc, row[j-11], b[j-11]);
6778 ll_mul_add(acc, row[j-12], b[j-12]);
6779 ll_mul_add(acc, row[j-13], b[j-13]);
6780 ll_mul_add(acc, row[j-14], b[j-14]);
6781 ll_mul_add(acc, row[j-15], b[j-15]);
6782 ll_mul_add(acc, row[j-16], b[j-16]);
6783 }
6784 switch (j) {
6785 case 16: ll_mul_add(acc, row[16-1], b[16-1]);
6786 case 15: ll_mul_add(acc, row[15-1], b[15-1]);
6787 case 14: ll_mul_add(acc, row[14-1], b[14-1]);
6788 case 13: ll_mul_add(acc, row[13-1], b[13-1]);
6789 case 12: ll_mul_add(acc, row[12-1], b[12-1]);
6790 case 11: ll_mul_add(acc, row[11-1], b[11-1]);
6791 case 10: ll_mul_add(acc, row[10-1], b[10-1]);
6792 case 9: ll_mul_add(acc, row[9-1], b[9-1]);
6793 case 8: ll_mul_add(acc, row[8-1], b[8-1]);
6794 case 7: ll_mul_add(acc, row[7-1], b[7-1]);
6795 case 6: ll_mul_add(acc, row[6-1], b[6-1]);
6796 case 5: ll_mul_add(acc, row[5-1], b[5-1]);
6797 case 4: ll_mul_add(acc, row[4-1], b[4-1]);
6798 case 3: ll_mul_add(acc, row[3-1], b[3-1]);
6799 case 2: ll_mul_add(acc, row[2-1], b[2-1]);
6800 }
6801
63046802 #else
63056803 for (j = 1; j < n; j++)
63066804 ll_mul_add(acc, row[j], b[j]);
71787676 case 2: ll_mul_add(acc, adata[2-1], tp[2-1]);
71797677 }
71807678
7679 #elif (TBL_UNROLL)
7680 long j = sa;
7681 for (; j > 16; j -= 16) {
7682 ll_mul_add(acc, adata[j-1], tp[j-1]);
7683 ll_mul_add(acc, adata[j-2], tp[j-2]);
7684 ll_mul_add(acc, adata[j-3], tp[j-3]);
7685 ll_mul_add(acc, adata[j-4], tp[j-4]);
7686 ll_mul_add(acc, adata[j-5], tp[j-5]);
7687 ll_mul_add(acc, adata[j-6], tp[j-6]);
7688 ll_mul_add(acc, adata[j-7], tp[j-7]);
7689 ll_mul_add(acc, adata[j-8], tp[j-8]);
7690 ll_mul_add(acc, adata[j-9], tp[j-9]);
7691 ll_mul_add(acc, adata[j-10], tp[j-10]);
7692 ll_mul_add(acc, adata[j-11], tp[j-11]);
7693 ll_mul_add(acc, adata[j-12], tp[j-12]);
7694 ll_mul_add(acc, adata[j-13], tp[j-13]);
7695 ll_mul_add(acc, adata[j-14], tp[j-14]);
7696 ll_mul_add(acc, adata[j-15], tp[j-15]);
7697 ll_mul_add(acc, adata[j-16], tp[j-16]);
7698 }
7699 switch (j) {
7700 case 16: ll_mul_add(acc, adata[16-1], tp[16-1]);
7701 case 15: ll_mul_add(acc, adata[15-1], tp[15-1]);
7702 case 14: ll_mul_add(acc, adata[14-1], tp[14-1]);
7703 case 13: ll_mul_add(acc, adata[13-1], tp[13-1]);
7704 case 12: ll_mul_add(acc, adata[12-1], tp[12-1]);
7705 case 11: ll_mul_add(acc, adata[11-1], tp[11-1]);
7706 case 10: ll_mul_add(acc, adata[10-1], tp[10-1]);
7707 case 9: ll_mul_add(acc, adata[9-1], tp[9-1]);
7708 case 8: ll_mul_add(acc, adata[8-1], tp[8-1]);
7709 case 7: ll_mul_add(acc, adata[7-1], tp[7-1]);
7710 case 6: ll_mul_add(acc, adata[6-1], tp[6-1]);
7711 case 5: ll_mul_add(acc, adata[5-1], tp[5-1]);
7712 case 4: ll_mul_add(acc, adata[4-1], tp[4-1]);
7713 case 3: ll_mul_add(acc, adata[3-1], tp[3-1]);
7714 case 2: ll_mul_add(acc, adata[2-1], tp[2-1]);
7715 }
7716
71817717 #else
71827718 long j;
71837719 for (j = 1; j < sa; j++)
84608996 }
84618997
84628998
8999 #ifdef NTL_PROVIDES_SS_LIP_IMPL
9000
9001 void
9002 _ntl_leftrotate(_ntl_gbigint *a, const _ntl_gbigint *b, long e,
9003 _ntl_gbigint p, long n, _ntl_gbigint *scratch)
9004 {
9005 if (e == 0 || ZEROP(*b)) {
9006 _ntl_gcopy(*b, a);
9007 return;
9008 }
9009
9010 long sb, nwords;
9011
9012 if (a == b || ((unsigned long) n) % NTL_ZZ_NBITS != 0 ||
9013 (sb = SIZE(*b)) == 1 + (nwords = ((unsigned long) n) / NTL_ZZ_NBITS)) {
9014
9015 _ntl_grshift(*b, n-e, scratch);
9016 _ntl_glowbits(*b, n-e, a);
9017 _ntl_glshift(*a, e, a);
9018
9019 if (_ntl_gcompare(*a, *scratch) < 0) {
9020 _ntl_gswitchbit(a, n);
9021 _ntl_gsadd(*a, 1, a);
9022 _ntl_gsubpos(*a, *scratch, a);
9023 }
9024 else {
9025 _ntl_gsubpos(*a, *scratch, a);
9026 }
9027
9028 return;
9029 }
9030
9031 long ewords = ((unsigned long) e) / NTL_ZZ_NBITS;
9032 long ebits = ((unsigned long) e) % NTL_ZZ_NBITS;
9033
9034 if (MustAlloc(*a, nwords+1)) _ntl_gsetlength(a, nwords+1);
9035
9036 _ntl_limb_t *adata = DATA(*a);
9037 _ntl_limb_t *bdata = DATA(*b);
9038
9039
9040 long special_carry = 0;
9041 long sa = 0;
9042
9043 if (ewords) {
9044 long hiwords = sb - (nwords-ewords);
9045 if (hiwords > 0) {
9046
9047 _ntl_limb_t borrow = NTL_MPN(neg)(adata, bdata + (nwords-ewords),
9048 hiwords);
9049 if (hiwords < ewords) {
9050 if (borrow) {
9051 for (long i = hiwords; i < ewords; i++)
9052 adata[i] = _ntl_limb_t(-1);
9053 }
9054 else {
9055 for (long i = hiwords; i < ewords; i++)
9056 adata[i] = 0;
9057 }
9058 }
9059
9060 if (borrow) {
9061 borrow = NTL_MPN(sub_1)(adata + ewords, bdata, nwords-ewords, 1);
9062 if (borrow) {
9063 special_carry = NTL_MPN(add_1)(adata, adata, nwords, 1);
9064 // special case: result so far is 2^n
9065 }
9066 }
9067 else {
9068 for (long i = 0; i < nwords-ewords; i++) adata[i+ewords] = bdata[i];
9069 }
9070
9071 sa = nwords;
9072 }
9073 else {
9074 for (long i = 0; i < ewords; i++) adata[i] = 0;
9075 for (long i = 0; i < sb; i++) adata[i+ewords] = bdata[i];
9076
9077 sa = ewords + sb;
9078 }
9079 }
9080 else {
9081 for (long i = 0; i < sb; i++) adata[i] = bdata[i];
9082 sa = sb;
9083 }
9084
9085 long here = 0;
9086
9087 if (ebits) {
9088 if (special_carry) {
9089 NTL_MPN(sub_1)(adata, adata, nwords, (1L << ebits) - 1L);
9090 }
9091 else if (sa == nwords) {
9092 _ntl_limb_t shout = NTL_MPN(lshift)(adata, adata, sa, ebits);
9093 if (shout) {
9094 _ntl_limb_t borrow = NTL_MPN(sub_1)(adata, adata, sa, shout);
9095 if (borrow) {
9096 _ntl_limb_t carry = NTL_MPN(add_1)(adata, adata, sa, 1);
9097 if (carry) {
9098 adata[sa] = 1;
9099 sa++;
9100 }
9101 }
9102 }
9103 }
9104 else { // sa < nwords
9105 _ntl_limb_t shout = NTL_MPN(lshift)(adata, adata, sa, ebits);
9106 if (shout) {
9107 adata[sa] = shout;
9108 sa++;
9109 }
9110 }
9111 }
9112 else {
9113 if (special_carry) {
9114 adata[sa] = 1;
9115 sa++;
9116 }
9117 }
9118
9119 STRIP(sa, adata);
9120 SIZE(*a) = sa;
9121
9122 }
9123
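_ntl_leftrotate above multiplies b by 2^e in the ring used by this Schönhage–Strassen-style code, which appears to be arithmetic modulo 2^n + 1; since 2^n = -1 (mod 2^n + 1), the bits shifted past position n wrap around with a negated sign. A single-word sketch of that identity (hypothetical, not NTL code; assumes 0 <= x <= 2^n, 1 <= e <= n-1, and n <= 62):

    #include <cstdint>

    // Compute (x * 2^e) mod (2^n + 1) by splitting x = hi*2^(n-e) + lo,
    // so that x*2^e = hi*2^n + lo*2^e = lo*2^e - hi (mod 2^n + 1).
    uint64_t leftrotate_fermat(uint64_t x, unsigned e, unsigned n)
    {
        uint64_t p  = (1ULL << n) + 1;
        uint64_t lo = x & ((1ULL << (n - e)) - 1);
        uint64_t hi = x >> (n - e);
        uint64_t t  = lo << e;                 // < 2^n, already reduced
        return (t >= hi) ? t - hi : t + p - hi;
    }
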
9124 void
9125 _ntl_ss_addmod(_ntl_gbigint *x, const _ntl_gbigint *a,
9126 const _ntl_gbigint *b, _ntl_gbigint p, long n)
9127 {
9128 if (((unsigned long) n) % NTL_ZZ_NBITS != 0) {
9129 _ntl_gadd(*a, *b, x);
9130 if (_ntl_gcompare(*x, p) >= 0) {
9131 _ntl_gsadd(*x, -1, x);
9132 _ntl_gswitchbit(x, n);
9133 }
9134 }
9135 else {
9136 _ntl_gadd(*a, *b, x);
9137 long sx, nwords;
9138 if (!*x ||
9139 (sx = SIZE(*x)) <= (nwords = ((unsigned long) n) / NTL_ZZ_NBITS))
9140 return;
9141
9142 _ntl_limb_t *xdata = DATA(*x);
9143 if (xdata[nwords] == 2) {
9144 for (long i = 0; i < nwords; i++) xdata[i] = _ntl_limb_t(-1);
9145 SIZE(*x) = nwords;
9146 return;
9147 }
9148
9149 long i = nwords-1;
9150 while (i >= 0 && xdata[i] == 0) i--;
9151 if (i < 0) return;
9152
9153 NTL_MPN(sub_1)(xdata, xdata, nwords, 1);
9154 sx = nwords;
9155 STRIP(sx, xdata);
9156 SIZE(*x) = sx;
9157 }
9158 }
9159
9160
9161 void
9162 _ntl_ss_submod(_ntl_gbigint *x, const _ntl_gbigint *a,
9163 const _ntl_gbigint *b, _ntl_gbigint p, long n)
9164 {
9165 if (((unsigned long) n) % NTL_ZZ_NBITS != 0) {
9166 if (_ntl_gcompare(*a, *b) < 0) {
9167 _ntl_gadd(*a, p, x);
9168 _ntl_gsubpos(*x, *b, x);
9169 }
9170 else {
9171 _ntl_gsubpos(*a, *b, x);
9172 }
9173 }
9174 else {
9175 if (ZEROP(*b)) {
9176 _ntl_gcopy(*a, x);
9177 return;
9178 }
9179
9180 long sb = SIZE(*b);
9181 _ntl_limb_t *bdata = DATA(*b);
9182
9183 long sa;
9184
9185 if (!*a)
9186 sa = 0;
9187 else
9188 sa = SIZE(*a);
9189
9190 long nwords = ((unsigned long) n) / NTL_ZZ_NBITS;
9191 if (MustAlloc(*x, nwords+1)) _ntl_gsetlength(x, nwords+1);
9192 _ntl_limb_t *xdata = DATA(*x);
9193
9194 if (sa >= sb) {
9195 _ntl_limb_t *adata = DATA(*a);
9196 _ntl_limb_t borrow = NTL_MPN(sub)(xdata, adata, sa, bdata, sb);
9197 if (borrow) {
9198 for (long i = sa; i < nwords; i++) xdata[i] = _ntl_limb_t(-1);
9199 _ntl_limb_t carry = NTL_MPN(add_1)(xdata, xdata, nwords, 1);
9200 if (carry) {
9201 xdata[nwords] = 1;
9202 SIZE(*x) = nwords+1;
9203 }
9204 else {
9205 long sx = nwords;
9206 STRIP(sx, xdata);
9207 SIZE(*x) = sx;
9208 }
9209 }
9210 else {
9211 long sx = sa;
9212 STRIP(sx, xdata);
9213 SIZE(*x) = sx;
9214 }
9215 }
9216 else {
9217 if (sa == 0) {
9218 xdata[0] = 1;
9219 }
9220 else {
9221 _ntl_limb_t *adata = DATA(*a);
9222 xdata[sa] = NTL_MPN(add_1)(xdata, adata, sa, 1);
9223 }
9224 for (long i = sa+1; i <= nwords; i++) xdata[i] = 0;
9225 xdata[nwords]++;
9226 _ntl_limb_t borrow = NTL_MPN(sub_n)(xdata, xdata, bdata, sb);
9227 if (borrow) {
9228 NTL_MPN(sub_1)(xdata+sb, xdata+sb, nwords+1-sb, 1);
9229 }
9230 long sx = nwords+1;
9231 STRIP(sx, xdata);
9232 SIZE(*x) = sx;
9233 }
9234 }
9235 }
9236
9237 #endif
9238
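The companion routines _ntl_ss_addmod and _ntl_ss_submod keep residues in [0, 2^n] and reduce with at most one correction step. A minimal single-word version of the addition case (hypothetical, not NTL code; assumes n <= 62 so the modulus and the sum fit in a uint64_t):

    #include <cstdint>

    // Add a and b modulo p = 2^n + 1, with 0 <= a, b <= 2^n.
    uint64_t addmod_fermat(uint64_t a, uint64_t b, unsigned n)
    {
        uint64_t p = (1ULL << n) + 1;
        uint64_t s = a + b;                    // no overflow for n <= 62
        return (s >= p) ? s - p : s;
    }
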
9239
9240
118118 p_info_owner.make();
119119 p_info = p_info_owner.get();
120120
121 bool bigtab = false;
121 long bigtab_index = -1;
122122 #ifdef NTL_FFT_BIGTAB
123 bigtab = true;
123 bigtab_index = 0;
124124 #endif
125 InitFFTPrimeInfo(*p_info, q, w, bigtab);
125 InitFFTPrimeInfo(*p_info, q, w, bigtab_index);
126126
127127 NumPrimes = 1;
128128 PrimeCnt = 0;
18911891 }
18921892 }
18931893
1894 void GCD(zz_pEX& x, const zz_pEX& a, const zz_pEX& b)
1894 void PlainGCD(zz_pEX& x, const zz_pEX& a, const zz_pEX& b)
18951895 {
18961896 zz_pE t;
18971897
19271927 }
19281928
19291929
1930
1931
1930 class _NTL_zz_pEXMatrix {
1931 private:
1932
1933 _NTL_zz_pEXMatrix(const _NTL_zz_pEXMatrix&); // disable
1934 zz_pEX elts[2][2];
1935
1936 public:
1937
1938 _NTL_zz_pEXMatrix() { }
1939 ~_NTL_zz_pEXMatrix() { }
1940
1941 void operator=(const _NTL_zz_pEXMatrix&);
1942 zz_pEX& operator() (long i, long j) { return elts[i][j]; }
1943 const zz_pEX& operator() (long i, long j) const { return elts[i][j]; }
1944 };
1945
1946
1947 void _NTL_zz_pEXMatrix::operator=(const _NTL_zz_pEXMatrix& M)
1948 {
1949 elts[0][0] = M.elts[0][0];
1950 elts[0][1] = M.elts[0][1];
1951 elts[1][0] = M.elts[1][0];
1952 elts[1][1] = M.elts[1][1];
1953 }
1954
1955
1956 static
1957 void mul(zz_pEX& U, zz_pEX& V, const _NTL_zz_pEXMatrix& M)
1958 // (U, V)^T = M*(U, V)^T
1959 {
1960 zz_pEX t1, t2, t3;
1961
1962 mul(t1, M(0,0), U);
1963 mul(t2, M(0,1), V);
1964 add(t3, t1, t2);
1965 mul(t1, M(1,0), U);
1966 mul(t2, M(1,1), V);
1967 add(V, t1, t2);
1968 U = t3;
1969 }
1970
1971
1972 static
1973 void mul(_NTL_zz_pEXMatrix& A, _NTL_zz_pEXMatrix& B, _NTL_zz_pEXMatrix& C)
1974 // A = B*C, B and C are destroyed
1975 {
1976 zz_pEX t1, t2;
1977
1978 mul(t1, B(0,0), C(0,0));
1979 mul(t2, B(0,1), C(1,0));
1980 add(A(0,0), t1, t2);
1981
1982 mul(t1, B(1,0), C(0,0));
1983 mul(t2, B(1,1), C(1,0));
1984 add(A(1,0), t1, t2);
1985
1986 mul(t1, B(0,0), C(0,1));
1987 mul(t2, B(0,1), C(1,1));
1988 add(A(0,1), t1, t2);
1989
1990 mul(t1, B(1,0), C(0,1));
1991 mul(t2, B(1,1), C(1,1));
1992 add(A(1,1), t1, t2);
1993
1994 long i, j;
1995 for (i = 0; i < 2; i++) {
1996 for (j = 0; j < 2; j++) {
1997 B(i,j).kill();
1998 C(i,j).kill();
1999 }
2000 }
2001 }
2002
2003
2004 void IterHalfGCD(_NTL_zz_pEXMatrix& M_out, zz_pEX& U, zz_pEX& V, long d_red)
2005 {
2006 M_out(0,0).SetMaxLength(d_red);
2007 M_out(0,1).SetMaxLength(d_red);
2008 M_out(1,0).SetMaxLength(d_red);
2009 M_out(1,1).SetMaxLength(d_red);
2010
2011 set(M_out(0,0)); clear(M_out(0,1));
2012 clear(M_out(1,0)); set(M_out(1,1));
2013
2014 long goal = deg(U) - d_red;
2015
2016 if (deg(V) <= goal)
2017 return;
2018
2019 zz_pEX Q, t(INIT_SIZE, d_red);
2020
2021 while (deg(V) > goal) {
2022 PlainDivRem(Q, U, U, V);
2023 swap(U, V);
2024
2025 mul(t, Q, M_out(1,0));
2026 sub(t, M_out(0,0), t);
2027 M_out(0,0) = M_out(1,0);
2028 M_out(1,0) = t;
2029
2030 mul(t, Q, M_out(1,1));
2031 sub(t, M_out(0,1), t);
2032 M_out(0,1) = M_out(1,1);
2033 M_out(1,1) = t;
2034 }
2035 }
2036
2037
2038 #define NTL_zz_pEX_HalfGCD_CROSSOVER (25)
2039 #define NTL_zz_pEX_GCD_CROSSOVER (275)
2040
2041
2042 void HalfGCD(_NTL_zz_pEXMatrix& M_out, const zz_pEX& U, const zz_pEX& V, long d_red)
2043 {
2044 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2045 set(M_out(0,0)); clear(M_out(0,1));
2046 clear(M_out(1,0)); set(M_out(1,1));
2047
2048 return;
2049 }
2050
2051
2052 long n = deg(U) - 2*d_red + 2;
2053 if (n < 0) n = 0;
2054
2055 zz_pEX U1, V1;
2056
2057 RightShift(U1, U, n);
2058 RightShift(V1, V, n);
2059
2060 if (d_red <= NTL_zz_pEX_HalfGCD_CROSSOVER) {
2061 IterHalfGCD(M_out, U1, V1, d_red);
2062 return;
2063 }
2064
2065 long d1 = (d_red + 1)/2;
2066 if (d1 < 1) d1 = 1;
2067 if (d1 >= d_red) d1 = d_red - 1;
2068
2069 _NTL_zz_pEXMatrix M1;
2070
2071 HalfGCD(M1, U1, V1, d1);
2072 mul(U1, V1, M1);
2073
2074 long d2 = deg(V1) - deg(U) + n + d_red;
2075
2076 if (IsZero(V1) || d2 <= 0) {
2077 M_out = M1;
2078 return;
2079 }
2080
2081
2082 zz_pEX Q;
2083 _NTL_zz_pEXMatrix M2;
2084
2085 DivRem(Q, U1, U1, V1);
2086 swap(U1, V1);
2087
2088 HalfGCD(M2, U1, V1, d2);
2089
2090 zz_pEX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
2091
2092 mul(t, Q, M1(1,0));
2093 sub(t, M1(0,0), t);
2094 swap(M1(0,0), M1(1,0));
2095 swap(M1(1,0), t);
2096
2097 t.kill();
2098
2099 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
2100
2101 mul(t, Q, M1(1,1));
2102 sub(t, M1(0,1), t);
2103 swap(M1(0,1), M1(1,1));
2104 swap(M1(1,1), t);
2105
2106 t.kill();
2107
2108 mul(M_out, M2, M1);
2109 }
2110
2111
2112
2113
2114 void XHalfGCD(_NTL_zz_pEXMatrix& M_out, zz_pEX& U, zz_pEX& V, long d_red)
2115 {
2116 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2117 set(M_out(0,0)); clear(M_out(0,1));
2118 clear(M_out(1,0)); set(M_out(1,1));
2119
2120 return;
2121 }
2122
2123 long du = deg(U);
2124
2125 if (d_red <= NTL_zz_pEX_HalfGCD_CROSSOVER) {
2126 IterHalfGCD(M_out, U, V, d_red);
2127 return;
2128 }
2129
2130 long d1 = (d_red + 1)/2;
2131 if (d1 < 1) d1 = 1;
2132 if (d1 >= d_red) d1 = d_red - 1;
2133
2134 _NTL_zz_pEXMatrix M1;
2135
2136 HalfGCD(M1, U, V, d1);
2137 mul(U, V, M1);
2138
2139 long d2 = deg(V) - du + d_red;
2140
2141 if (IsZero(V) || d2 <= 0) {
2142 M_out = M1;
2143 return;
2144 }
2145
2146
2147 zz_pEX Q;
2148 _NTL_zz_pEXMatrix M2;
2149
2150 DivRem(Q, U, U, V);
2151 swap(U, V);
2152
2153 XHalfGCD(M2, U, V, d2);
2154
2155 zz_pEX t(INIT_SIZE, deg(M1(1,1))+deg(Q)+1);
2156
2157 mul(t, Q, M1(1,0));
2158 sub(t, M1(0,0), t);
2159 swap(M1(0,0), M1(1,0));
2160 swap(M1(1,0), t);
2161
2162 t.kill();
2163
2164 t.SetMaxLength(deg(M1(1,1))+deg(Q)+1);
2165
2166 mul(t, Q, M1(1,1));
2167 sub(t, M1(0,1), t);
2168 swap(M1(0,1), M1(1,1));
2169 swap(M1(1,1), t);
2170
2171 t.kill();
2172
2173 mul(M_out, M2, M1);
2174 }
2175
2176 void HalfGCD(zz_pEX& U, zz_pEX& V)
2177 {
2178 long d_red = (deg(U)+1)/2;
2179
2180 if (IsZero(V) || deg(V) <= deg(U) - d_red) {
2181 return;
2182 }
2183
2184 long du = deg(U);
2185
2186
2187 long d1 = (d_red + 1)/2;
2188 if (d1 < 1) d1 = 1;
2189 if (d1 >= d_red) d1 = d_red - 1;
2190
2191 _NTL_zz_pEXMatrix M1;
2192
2193 HalfGCD(M1, U, V, d1);
2194 mul(U, V, M1);
2195
2196 long d2 = deg(V) - du + d_red;
2197
2198 if (IsZero(V) || d2 <= 0) {
2199 return;
2200 }
2201
2202 M1(0,0).kill();
2203 M1(0,1).kill();
2204 M1(1,0).kill();
2205 M1(1,1).kill();
2206
2207
2208 zz_pEX Q;
2209
2210 DivRem(Q, U, U, V);
2211 swap(U, V);
2212
2213 HalfGCD(M1, U, V, d2);
2214
2215 mul(U, V, M1);
2216 }
2217
2218
2219 void GCD(zz_pEX& d, const zz_pEX& u, const zz_pEX& v)
2220 {
2221 zz_pEX u1, v1;
2222
2223 u1 = u;
2224 v1 = v;
2225
2226 if (deg(u1) == deg(v1)) {
2227 if (IsZero(u1)) {
2228 clear(d);
2229 return;
2230 }
2231
2232 rem(v1, v1, u1);
2233 }
2234 else if (deg(u1) < deg(v1)) {
2235 swap(u1, v1);
2236 }
2237
2238 // deg(u1) > deg(v1)
2239
2240 while (deg(u1) > NTL_zz_pEX_GCD_CROSSOVER && !IsZero(v1)) {
2241 HalfGCD(u1, v1);
2242
2243 if (!IsZero(v1)) {
2244 rem(u1, u1, v1);
2245 swap(u1, v1);
2246 }
2247 }
2248
2249 PlainGCD(d, u1, v1);
2250 }
2251
19322252
19332253 void XGCD(zz_pEX& d, zz_pEX& s, zz_pEX& t, const zz_pEX& a, const zz_pEX& b)
19342254 {
1935 zz_pE z;
1936
1937
1938 if (IsZero(b)) {
2255 zz_pE w;
2256
2257 if (IsZero(a) && IsZero(b)) {
2258 clear(d);
19392259 set(s);
19402260 clear(t);
1941 d = a;
1942 }
1943 else if (IsZero(a)) {
1944 clear(s);
1945 set(t);
1946 d = b;
1947 }
1948 else {
1949 long e = max(deg(a), deg(b)) + 1;
1950
1951 zz_pEX temp(INIT_SIZE, e), u(INIT_SIZE, e), v(INIT_SIZE, e),
1952 u0(INIT_SIZE, e), v0(INIT_SIZE, e),
1953 u1(INIT_SIZE, e), v1(INIT_SIZE, e),
1954 u2(INIT_SIZE, e), v2(INIT_SIZE, e), q(INIT_SIZE, e);
1955
1956
1957 set(u1); clear(v1);
1958 clear(u2); set(v2);
1959 u = a; v = b;
1960
1961 do {
1962 DivRem(q, u, u, v);
1963 swap(u, v);
1964 u0 = u2;
1965 v0 = v2;
1966 mul(temp, q, u2);
1967 sub(u2, u1, temp);
1968 mul(temp, q, v2);
1969 sub(v2, v1, temp);
1970 u1 = u0;
1971 v1 = v0;
1972 } while (!IsZero(v));
1973
1974 d = u;
1975 s = u1;
1976 t = v1;
1977 }
1978
1979 if (IsZero(d)) return;
1980 if (IsOne(LeadCoeff(d))) return;
1981
1982 /* make gcd monic */
1983
1984 inv(z, LeadCoeff(d));
1985 mul(d, d, z);
1986 mul(s, s, z);
1987 mul(t, t, z);
2261 return;
2262 }
2263
2264 zz_pEX U, V, Q;
2265
2266 U = a;
2267 V = b;
2268
2269 long flag = 0;
2270
2271 if (deg(U) == deg(V)) {
2272 DivRem(Q, U, U, V);
2273 swap(U, V);
2274 flag = 1;
2275 }
2276 else if (deg(U) < deg(V)) {
2277 swap(U, V);
2278 flag = 2;
2279 }
2280
2281 _NTL_zz_pEXMatrix M;
2282
2283 XHalfGCD(M, U, V, deg(U)+1);
2284
2285 d = U;
2286
2287 if (flag == 0) {
2288 s = M(0,0);
2289 t = M(0,1);
2290 }
2291 else if (flag == 1) {
2292 s = M(0,1);
2293 mul(t, Q, M(0,1));
2294 sub(t, M(0,0), t);
2295 }
2296 else { /* flag == 2 */
2297 s = M(0,1);
2298 t = M(0,0);
2299 }
2300
2301 // normalize
2302
2303 inv(w, LeadCoeff(d));
2304 mul(d, d, w);
2305 mul(s, s, w);
2306 mul(t, t, w);
19882307 }
19892308
19902309
0 #include <NTL/lzz_pXFactoring.h>
1 #include <NTL/lzz_pEX.h>
2
3 NTL_CLIENT
4
5
6
7 void test(zz_pX& P, zz_pEX& f, zz_pEX& g, zz_pEX& h, zz_pEX& hx, zz_pEX& s, zz_pEX& t)
8 {
9 /* P is the polynomial of the extension
10 * f and g the polynomials
11 * h the gcd
12 * hx the gcd obtained using XGCD
13 * s, t are Bezout coefficients hx=f*s+g*t
14 */
15 zz_pEX htest,rf,rg;
16
17 if (h!=hx){
18 cout << P << "\n" << f << "\n" << g << "\n";
19 Error("different gcd:\n");
20 }
21
22 if (max(deg(f), deg(g)) > 0 || min(deg(f), deg(g)) >= 0) {
23 if (deg(s) >= deg(g) || deg(t) >= deg(f)) {
24 cout << P << "\n" << f << "\n" << g << "\n";
25 Error("degree bounds at fault:\n");
26 }
27 }
28
29
30 mul(s,s,f);
31 mul(t,t,g);
32 add(htest,t,s);
33 if (h!=htest){
34 cout << P << "\n" << f << "\n" << g << "\n";
35 Error("xgcd at fault:\n");
36 }
37 if (!IsZero(h)){
38 rem(rf,f,h);
39 rem(rg,g,h);
40 if ((!IsZero(rf))||(!IsZero(rg))){
41 cout << P << "\n" << f << "\n" << g << "\n";
42 Error("not a common divisor\n");
43 }
44 }else{
45 if (!IsZero(f) && !IsZero(g)){
46 cout << "debug:\n";
47 cout << P << "\n" << f << "\n" << g << "\n" << h << "\n";
48 Error("ooops:\n");
49 }
50 }
51 }
52
53
54 int main()
55 {
56
57 long prime = 17;
58
59 zz_p::init(prime);
60
61 zz_pX P;
62
63 BuildIrred(P, 5);
64
65 zz_pE::init(P);
66
67 for (long i = 0; i < 400; i++) {
68 if (i%10 == 0) cerr << ".";
69 zz_pEX f,g,h,s,t,hx;
70
71 long deg_h;
72 if (RandomBnd(2))
73 deg_h = RandomBnd(10)+1;
74 else
75 deg_h = RandomBnd(500)+1;
76
77 random(h, deg_h);
78 SetCoeff(h, deg_h);
79
80 long deg_f;
81 if (RandomBnd(2))
82 deg_f = RandomBnd(10)+1;
83 else
84 deg_f = RandomBnd(1000)+1;
85
86 random(f, deg_f);
87 f *= h;
88
89 long deg_g;
90 if (RandomBnd(2))
91 deg_g = RandomBnd(10)+1;
92 else
93 deg_g = RandomBnd(1000)+1;
94
95 random(g, deg_g);
96 g *= h;
97
98 h = 0;
99
100 GCD(h, f, g);
101 XGCD(hx, s, t, f, g);
102 test(P, f, g, h, hx, s, t);
103 }
104
105 cerr << "\n";
106
107 }
00
11 #include <NTL/lzz_pX.h>
2 #include <NTL/FFT_impl.h>
23
34
45 NTL_START_IMPL
13971398
13981399 if (R.k < 0) {
13991400 k = -1;
1401 len = 0;
14001402 return *this;
14011403 }
14021404
14031405 DoSetSize(R.k, R.NumPrimes);
1404 long i, j, n;
1405
1406 n = 1L << k;
1406 len = R.len;
1407
1408 long i, j;
14071409
14081410 for (i = 0; i < NumPrimes; i++)
1409 for (j = 0; j < n; j++)
1411 for (j = 0; j < len; j++)
14101412 tbl[i][j] = R.tbl[i][j];
14111413
14121414 return *this;
16151617
16161618
16171619
1618 void TofftRep(fftRep& y, const zz_pX& x, long k, long lo, long hi)
1620 void TofftRep_trunc(fftRep& y, const zz_pX& x, long k,
1621 long len, long lo, long hi)
16191622 // computes an n = 2^k point convolution.
16201623 // if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
16211624 {
16361639 hi = min(hi, deg(x));
16371640
16381641 y.SetSize(k);
1639
16401642 n = 1L << k;
16411643
1644 y.len = len = FFTRoundUp(len, k);
1645
16421646 m = max(hi-lo + 1, 0);
1647 long ilen = FFTRoundUp(m, k);
16431648
16441649 const zz_p *xx = x.rep.elts();
16451650
16511656 for (j = 0; j < m; j++) {
16521657 yp[j] = rep(xx[j+lo]);
16531658 }
1654 for (j = m; j < n; j++) {
1659 for (j = m; j < ilen; j++) {
16551660 yp[j] = 0;
16561661 }
16571662 }
16741679 t = sp_CorrectExcess(t, q);
16751680 yp[j] = t;
16761681 }
1677 for (j = m; j < n; j++) {
1682 for (j = m; j < ilen; j++) {
16781683 yp[j] = 0;
16791684 }
16801685 }
16971702
16981703 if (p_info) {
16991704 long *yp = &y.tbl[0][0];
1700 FFTFwd(yp, yp, k, *p_info);
1705 FFTFwd_trunc(yp, yp, k, *p_info, len, ilen);
17011706 }
17021707 else {
17031708 for (i = 0; i < nprimes; i++) {
17041709 long *yp = &y.tbl[i][0];
1705 FFTFwd(yp, yp, k, i);
1710 FFTFwd_trunc(yp, yp, k, i, len, ilen);
17061711 }
17071712 }
17081713 }
17331738 y.SetSize(k);
17341739
17351740 n = 1L << k;
1741 y.len = n;
17361742
17371743 m = max(hi-lo + 1, 0);
17381744
17801786
17811787 if (p_info) {
17821788 long *yp = &y.tbl[0][0];
1783 FFTRev1(yp, yp, k, *p_info);
1789 FFTRev1_trans(yp, yp, k, *p_info);
17841790 }
17851791 else {
17861792 for (i = 0; i < info->NumPrimes; i++) {
17871793 long *yp = &y.tbl[i][0];
1788 FFTRev1(yp, yp, k, i);
1794 FFTRev1_trans(yp, yp, k, i);
17891795 }
17901796 }
17911797 }
18061812 k = y.k;
18071813 n = (1L << k);
18081814
1815 hi = min(hi, n-1);
1816 l = hi-lo+1;
1817 l = max(l, 0);
1818
1819 long len = y.len;
1820 if (len <= hi) LogicError("FromfftRep: bad len");
1821
18091822 FFTPrimeInfo *p_info = info->p_info;
18101823
18111824 if (p_info) {
18121825 long *yp = &y.tbl[0][0];
1813 FFTRev1(yp, yp, k, *p_info);
1826 FFTRev1_trunc(yp, yp, k, *p_info, len);
18141827 }
18151828 else {
18161829 for (i = 0; i < NumPrimes; i++) {
18171830 long *yp = &y.tbl[i][0];
1818 FFTRev1(yp, yp, k, i);
1819 }
1820 }
1821
1822 hi = min(hi, n-1);
1823 l = hi-lo+1;
1824 l = max(l, 0);
1831 FFTRev1_trunc(yp, yp, k, i, len);
1832 }
1833 }
1834
18251835 x.rep.SetLength(l);
18261836
18271837 if (p_info) {
18541864 k = y.k;
18551865 n = (1L << k);
18561866
1867 if (y.len != n) LogicError("RevFromfftRep: bad len");
1868
18571869 FFTPrimeInfo *p_info = info->p_info;
18581870
18591871 if (p_info) {
18601872 long *yp = &y.tbl[0][0];
1861 FFTFwd(yp, yp, k, *p_info);
1873 FFTFwd_trans(yp, yp, k, *p_info);
18621874 }
18631875 else {
18641876 for (i = 0; i < NumPrimes; i++) {
18651877 long *yp = &y.tbl[i][0];
1866 FFTFwd(yp, yp, k, i);
1878 FFTFwd_trans(yp, yp, k, i);
18671879 }
18681880 }
18691881
18941906 k = y.k;
18951907 n = (1L << k);
18961908
1909 hi = min(hi, n-1);
1910 l = hi-lo+1;
1911 l = max(l, 0);
1912
1913 long len = y.len;
1914 if (len <= hi) LogicError("FromfftRep: bad len");
1915
18971916 z.SetSize(k);
18981917
18991918 FFTPrimeInfo *p_info = info->p_info;
19011920 if (p_info) {
19021921 long *zp = &z.tbl[0][0];
19031922 const long *yp = &y.tbl[0][0];
1904 FFTRev1(zp, yp, k, *p_info);
1923 FFTRev1_trunc(zp, yp, k, *p_info, len);
19051924 }
19061925 else {
19071926 for (i = 0; i < NumPrimes; i++) {
19081927 long *zp = &z.tbl[i][0];
19091928 const long *yp = &y.tbl[i][0];
1910 FFTRev1(zp, yp, k, i);
1911 }
1912 }
1913
1914 hi = min(hi, n-1);
1915 l = hi-lo+1;
1916 l = max(l, 0);
1929 FFTRev1_trunc(zp, yp, k, i, len);
1930 }
1931 }
1932
19171933 x.rep.SetLength(l);
19181934
19191935 if (p_info) {
19501966
19511967 k = y.k;
19521968 n = (1L << k);
1969
1970
1971 //if (y.len <= min(hi, n-1)) LogicError("FromfftRep: bad len");
1972 if (y.len != n) LogicError("FromfftRep: bad len");
19531973
19541974 FFTPrimeInfo *p_info = info->p_info;
19551975
19952015
19962016 z.SetSize(k);
19972017
2018 long len = z.len = min(x.len, y.len);
2019
19982020 FFTPrimeInfo *p_info = info->p_info;
19992021
20002022 if (p_info) {
20052027 mulmod_t qinv = p_info->qinv;
20062028
20072029 if (NormalizedModulus(qinv)) {
2008 for (j = 0; j < n; j++)
2030 for (j = 0; j < len; j++)
20092031 zp[j] = NormalizedMulMod(xp[j], yp[j], q, qinv);
20102032 }
20112033 else {
2012 for (j = 0; j < n; j++)
2034 for (j = 0; j < len; j++)
20132035 zp[j] = MulMod(xp[j], yp[j], q, qinv);
20142036 }
20152037 }
20212043 long q = GetFFTPrime(i);
20222044 mulmod_t qinv = GetFFTPrimeInv(i);
20232045
2024 for (j = 0; j < n; j++)
2046 for (j = 0; j < len; j++)
20252047 zp[j] = NormalizedMulMod(xp[j], yp[j], q, qinv);
20262048 }
20272049 }
20392061 n = 1L << k;
20402062
20412063 z.SetSize(k);
2064
2065 long len = z.len = min(x.len, y.len);
20422066
20432067 FFTPrimeInfo *p_info = info->p_info;
20442068
20482072 const long *yp = &y.tbl[0][0];
20492073 long q = p_info->q;
20502074
2051 for (j = 0; j < n; j++)
2075 for (j = 0; j < len; j++)
20522076 zp[j] = SubMod(xp[j], yp[j], q);
20532077 }
20542078 else {
20582082 const long *yp = &y.tbl[i][0];
20592083 long q = GetFFTPrime(i);
20602084
2061 for (j = 0; j < n; j++)
2085 for (j = 0; j < len; j++)
20622086 zp[j] = SubMod(xp[j], yp[j], q);
20632087 }
20642088 }
20762100 n = 1L << k;
20772101
20782102 z.SetSize(k);
2103
2104 long len = z.len = min(x.len, y.len);
20792105
20802106 FFTPrimeInfo *p_info = info->p_info;
20812107
20852111 const long *yp = &y.tbl[0][0];
20862112 long q = p_info->q;
20872113
2088 for (j = 0; j < n; j++)
2114 for (j = 0; j < len; j++)
20892115 zp[j] = AddMod(xp[j], yp[j], q);
20902116 }
20912117 else {
20952121 const long *yp = &y.tbl[i][0];
20962122 long q = GetFFTPrime(i);
20972123
2098 for (j = 0; j < n; j++)
2124 for (j = 0; j < len; j++)
20992125 zp[j] = AddMod(xp[j], yp[j], q);
21002126 }
21012127 }
21162142 n = 1L << k;
21172143
21182144 if (l < k) LogicError("reduce: bad operands");
2145 if (a.len < n) LogicError("reduce: bad len");
21192146
21202147 x.SetSize(k);
2148 x.len = n;
2149
2150 if (&x == &a) return;
21212151
21222152 for (i = 0; i < info->NumPrimes; i++) {
21232153 ap = &a.tbl[i][0];
21242154 xp = &x.tbl[i][0];
21252155 for (j = 0; j < n; j++)
2126 xp[j] = ap[j << (l-k)];
2127 }
2128 }
2156 xp[j] = ap[j];
2157 }
2158 }
2159
21292160
21302161 void AddExpand(fftRep& x, const fftRep& a)
21312162 // x = x + (an "expanded" version of a)
21392170 n = 1L << k;
21402171
21412172 if (l < k) LogicError("AddExpand: bad args");
2173 if (x.len < n) LogicError("AddExpand: bad len");
21422174
21432175 FFTPrimeInfo *p_info = info->p_info;
21442176
21472179 const long *ap = &a.tbl[0][0];
21482180 long *xp = &x.tbl[0][0];
21492181 for (j = 0; j < n; j++) {
2150 long j1 = j << (l-k);
2151 xp[j1] = AddMod(xp[j1], ap[j], q);
2182 xp[j] = AddMod(xp[j], ap[j], q);
21522183 }
21532184 }
21542185 else {
21572188 const long *ap = &a.tbl[i][0];
21582189 long *xp = &x.tbl[i][0];
21592190 for (j = 0; j < n; j++) {
2160 long j1 = j << (l-k);
2161 xp[j1] = AddMod(xp[j1], ap[j], q);
2191 xp[j] = AddMod(xp[j], ap[j], q);
21622192 }
21632193 }
21642194 }
21652195 }
21662196
21672197
2168
21692198 void FFTMul(zz_pX& x, const zz_pX& a, const zz_pX& b)
21702199 {
2171 long k, d;
2172
21732200 if (IsZero(a) || IsZero(b)) {
21742201 clear(x);
21752202 return;
21762203 }
21772204
2178 d = deg(a) + deg(b);
2179 k = NextPowerOfTwo(d+1);
2205 long da = deg(a);
2206 long db = deg(b);
2207 long d = da+db;
2208 long k = NextPowerOfTwo(d+1);
21802209
21812210 fftRep R1(INIT_SIZE, k), R2(INIT_SIZE, k);
21822211
2183 TofftRep(R1, a, k);
2184 TofftRep(R2, b, k);
2212 TofftRep_trunc(R1, a, k, d+1);
2213 TofftRep_trunc(R2, b, k, d+1);
21852214 mul(R1, R1, R2);
21862215 FromfftRep(x, R1, 0, d);
21872216 }
21882217
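// Worked illustration (added for exposition; the degrees below are hypothetical,
// not taken from the library source): for deg(a) = 600 and deg(b) = 900 we get
// d = 1500 and k = NextPowerOfTwo(d+1) = 11, so each fftRep table has 2^11 = 2048
// slots, yet only the first d+1 = 1501 values are needed to recover the product.
// TofftRep_trunc records that bound in the fftRep's len field, which the
// FromfftRep variants above check ("FromfftRep: bad len") before reading results.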
21892218 void FFTSqr(zz_pX& x, const zz_pX& a)
21902219 {
2191 long k, d;
2192
21932220 if (IsZero(a)) {
21942221 clear(x);
21952222 return;
21962223 }
21972224
2198 d = 2*deg(a);
2199 k = NextPowerOfTwo(d+1);
2225 long da = deg(a);
2226 long d = 2*da;
2227 long k = NextPowerOfTwo(d+1);
22002228
22012229 fftRep R1(INIT_SIZE, k);
22022230
2203 TofftRep(R1, a, k);
2231 TofftRep_trunc(R1, a, k, d+1);
22042232 mul(R1, R1, R1);
22052233 FromfftRep(x, R1, 0, d);
22062234 }
22852313 fftRep R1(INIT_SIZE, F.l);
22862314 zz_pX P1(INIT_SIZE, n);
22872315
2288 TofftRep(R1, a, F.l, n, 2*(n-1));
2316 TofftRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
22892317 mul(R1, R1, F.HRep);
22902318 FromfftRep(P1, R1, n-2, 2*n-4);
22912319
23412369 fftRep R1(INIT_SIZE, F.l);
23422370 zz_pX P1(INIT_SIZE, n), qq;
23432371
2344 TofftRep(R1, a, F.l, n, 2*(n-1));
2372 TofftRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
23452373 mul(R1, R1, F.HRep);
23462374 FromfftRep(P1, R1, n-2, 2*n-4);
23472375 qq = P1;
23972425 fftRep R1(INIT_SIZE, F.l);
23982426 zz_pX P1(INIT_SIZE, n);
23992427
2400 TofftRep(R1, a, F.l, n, 2*(n-1));
2428 TofftRep_trunc(R1, a, F.l, 2*n-3, n, 2*(n-1));
24012429 mul(R1, R1, F.HRep);
24022430 FromfftRep(x, R1, n-2, 2*n-4);
24032431 }
25972625 fftRep R1(INIT_SIZE, k), R2(INIT_SIZE, F.l);
25982626 zz_pX P1(INIT_SIZE, n);
25992627
2600 TofftRep(R1, a, k);
2601 TofftRep(R2, b, k);
2602
2628 long len;
2629 if (zz_p::IsFFTPrime())
2630 len = n;
2631 else
2632 len = 1L << F.k;
2633
2634 TofftRep_trunc(R1, a, k, max(1L << F.k, d));
2635 TofftRep_trunc(R2, b, k, max(1L << F.k, d));
26032636 mul(R1, R1, R2);
2604
26052637 NDFromfftRep(P1, R1, n, d-1, R2); // save R1 for future use
2606
2607 TofftRep(R2, P1, F.l);
2638
2639 TofftRep_trunc(R2, P1, F.l, 2*n-3);
26082640 mul(R2, R2, F.HRep);
26092641 FromfftRep(P1, R2, n-2, 2*n-4);
26102642
2611 TofftRep(R2, P1, F.k);
2643 TofftRep_trunc(R2, P1, F.k, len);
26122644 mul(R2, R2, F.FRep);
26132645 reduce(R1, R1, F.k);
26142646 sub(R1, R1, R2);
26432675 fftRep R1(INIT_SIZE, k), R2(INIT_SIZE, F.l);
26442676 zz_pX P1(INIT_SIZE, n);
26452677
2646 TofftRep(R1, a, k);
2678 long len;
2679 if (zz_p::IsFFTPrime())
2680 len = n;
2681 else
2682 len = 1L << F.k;
2683
2684 TofftRep_trunc(R1, a, k, max(1L << F.k, d));
26472685 mul(R1, R1, R1);
2648 NDFromfftRep(P1, R1, n, d-1, R2); // save R1 for future use
2649
2650 TofftRep(R2, P1, F.l);
2686 NDFromfftRep(P1, R1, n, d-1, R2); // save R1 for future use
2687
2688 TofftRep_trunc(R2, P1, F.l, 2*n-3);
26512689 mul(R2, R2, F.HRep);
26522690 FromfftRep(P1, R2, n-2, 2*n-4);
26532691
2654 TofftRep(R2, P1, F.k);
2692 TofftRep_trunc(R2, P1, F.k, len);
26552693 mul(R2, R2, F.FRep);
26562694 reduce(R1, R1, F.k);
26572695 sub(R1, R1, R2);
28642902 zz_pX P1(INIT_SIZE, n);
28652903
28662904
2867 TofftRep(R1, b, F.l);
2905 TofftRep_trunc(R1, b, F.l, 2*n-2);
28682906 reduce(x.B2, R1, F.k);
28692907 mul(R1, R1, F.HRep);
28702908 FromfftRep(P1, R1, n-1, 2*n-3);
2909
28712910 TofftRep(x.B1, P1, F.l);
2911 // could be truncated to length max(1L << F.k, 2*n-2), except
2912 // for the usage in UpdateMap, where we would have to investigate
2913 // further
2914
28722915 }
28732916
28742917
28992942 zz_pX P1(INIT_SIZE, n), P2(INIT_SIZE, n);
29002943 fftRep R1(INIT_SIZE, F.l), R2(INIT_SIZE, F.l);
29012944
2902 TofftRep(R1, a, F.l);
2945 long len;
2946 if (zz_p::IsFFTPrime())
2947 len = n;
2948 else
2949 len = 1L << F.k;
2950
2951 TofftRep_trunc(R1, a, F.l, max(1L << F.k, 2*n-2));
29032952 mul(R2, R1, B.B1);
29042953 FromfftRep(P1, R2, n-1, 2*n-3);
29052954
29062955 reduce(R1, R1, F.k);
29072956 mul(R1, R1, B.B2);
2908 TofftRep(R2, P1, F.k);
2957 TofftRep_trunc(R2, P1, F.k, len);
29092958 mul(R2, R2, F.FRep);
29102959 sub(R1, R1, R2);
29112960
0 #include <NTL/lzz_pX.h>
1
2 NTL_CLIENT
3
4 #define ITER (500)
5
6 void multest()
7 {
8 cerr << "mul";
9 for (long iter = 0; iter < ITER; iter++) {
10 if (iter % 100 == 0) cerr << ".";
11
12 long da = RandomBnd(5000) + 100;
13 long db = RandomBnd(5000) + 100;
14
15 zz_pX a, b, c1, c2;
16
17 random(a, da);
18 random(b, db);
19
20 if (deg(a) < 80 || deg(b) < 80) {
21 cerr << "*";
22 continue;
23 }
24
25 FFTMul(c1, a, b);
26 PlainMul(c2, a, b);
27
28 if (c1 != c2) {
29 cerr << "******* oops\n";
30 break;
31 }
32 }
33
34 cerr << "\n";
35 }
36
37
38 void sqrtest()
39 {
40 cerr << "sqr";
41 for (long iter = 0; iter < ITER; iter++) {
42 if (iter % 100 == 0) cerr << ".";
43
44 long da = RandomBnd(5000) + 100;
45 long db = RandomBnd(5000) + 100;
46
47 zz_pX a, b, c1, c2;
48
49 random(a, da);
50
51 if (deg(a) < 80) {
52 cerr << "*";
53 continue;
54 }
55
56 FFTSqr(c1, a);
57 PlainMul(c2, a, a);
58
59 if (c1 != c2) {
60 cerr << "******* oops\n";
61 break;
62 }
63 }
64
65 cerr << "\n";
66 }
67
68
69
70
71 void mulmodtest()
72 {
73 cerr << "mulmod";
74 for (long iter = 0; iter < ITER; iter++) {
75 if (iter % 100 == 0) cerr << ".";
76
77 long n = RandomBnd(5000) + 300;
78 long da = RandomBnd(n)+1;
79 long db = RandomBnd(n)+1;
80
81 if (RandomBnd(2)) { da = n; db = n; }
82
83 zz_pX f;
84 random(f, n);
85 SetCoeff(f, n);
86 zz_pXModulus F(f);
87
88 zz_pX a, b, c1, c2;
89 random(a, da);
90 random(b, db);
91
92 MulMod(c1, a, b, F);
93 PlainMul(c2, a, b);
94 rem(c2, c2, f);
95
96 if (c1 != c2) {
97 cerr << "******** oops\n";
98 break;
99 }
100 }
101
102 cerr << "\n";
103 }
104
105
106 void sqrmodtest()
107 {
108 cerr << "sqrmod";
109 for (long iter = 0; iter < ITER; iter++) {
110 if (iter % 100 == 0) cerr << ".";
111
112 long n = RandomBnd(5000) + 300;
113 long da = RandomBnd(n)+1;
114 long db = RandomBnd(n)+1;
115
116 if (RandomBnd(2)) { da = n; db = n; }
117
118 zz_pX f;
119 random(f, n);
120 SetCoeff(f, n);
121 zz_pXModulus F(f);
122
123 zz_pX a, b, c1, c2;
124 random(a, da);
125 random(b, db);
126
127 SqrMod(c1, a, F);
128
129 PlainMul(c2, a, a);
130 rem(c2, c2, f);
131
132 if (c1 != c2) {
133 cerr << "******** oops\n";
134 break;
135 }
136 }
137
138 cerr << "\n";
139 }
140
141
142
143 void mulmod1test()
144 {
145 cerr << "mulmod1";
146 for (long iter = 0; iter < ITER; iter++) {
147 if (iter % 100 == 0) cerr << ".";
148
149 long n = RandomBnd(5000) + 300;
150 long da = RandomBnd(n)+1;
151 long db = RandomBnd(n)+1;
152
153 if (RandomBnd(2)) { da = n; db = n; }
154
155 zz_pX f;
156 random(f, n);
157 SetCoeff(f, n);
158 zz_pXModulus F(f);
159
160 zz_pX a, b, c1, c2;
161 random(a, da);
162 random(b, db);
163
164 zz_pXMultiplier bb;
165 build(bb, b, F);
166
167 MulMod(c1, a, bb, F);
168
169 PlainMul(c2, a, b);
170 rem(c2, c2, f);
171
172 if (c1 != c2) {
173 cerr << "******** oops\n";
174 break;
175 }
176 }
177
178 cerr << "\n";
179 }
180
181
182 namespace NTL {
183
184 void CopyReverse(zz_pX& x, const zz_pX& a, long lo, long hi);
185
186 }
187
188
189
190 struct zz_pXTransMultiplier {
191 zz_pX f0, fbi, b;
192 long shamt, shamt_fbi, shamt_b;
193 };
194
195
196
197
198 void build(zz_pXTransMultiplier& B, const zz_pX& b, const zz_pXModulus& F)
199 {
200 long db = deg(b);
201
202 if (db >= F.n) LogicError("build TransMultiplier: bad args");
203
204 zz_pX t;
205
206 LeftShift(t, b, F.n-1);
207 div(t, t, F);
208
209 // we optimize for low degree b
210
211 long d;
212
213 d = deg(t);
214 if (d < 0)
215 B.shamt_fbi = 0;
216 else
217 B.shamt_fbi = F.n-2 - d;
218
219 CopyReverse(B.fbi, t, 0, d);
220
221 // The following code optimizes the case when
222 // f = X^n + low degree poly
223
224 trunc(t, F.f, F.n);
225 d = deg(t);
226 if (d < 0)
227 B.shamt = 0;
228 else
229 B.shamt = d;
230
231 CopyReverse(B.f0, t, 0, d);
232
233 if (db < 0)
234 B.shamt_b = 0;
235 else
236 B.shamt_b = db;
237
238 CopyReverse(B.b, b, 0, db);
239 }
240
241
242
243 void TransMulMod(zz_pX& x, const zz_pX& a, const zz_pXTransMultiplier& B,
244 const zz_pXModulus& F)
245 {
246 if (deg(a) >= F.n) LogicError("TransMulMod: bad args");
247
248 zz_pX t1, t2;
249
250 mul(t1, a, B.b);
251 RightShift(t1, t1, B.shamt_b);
252
253 mul(t2, a, B.f0);
254 RightShift(t2, t2, B.shamt);
255 trunc(t2, t2, F.n-1);
256
257 mul(t2, t2, B.fbi);
258 if (B.shamt_fbi > 0) LeftShift(t2, t2, B.shamt_fbi);
259 trunc(t2, t2, F.n-1);
260 LeftShift(t2, t2, 1);
261
262 sub(x, t1, t2);
263 }
264
265
266
267 void UpdateMap(vec_zz_p& x, const vec_zz_p& a,
268 const zz_pXTransMultiplier& B, const zz_pXModulus& F)
269 {
270 zz_pX xx;
271 TransMulMod(xx, to_zz_pX(a), B, F);
272 x = xx.rep;
273 }
274
275
276
277 void updatetest()
278 {
279 cerr << "update";
280 for (long iter = 0; iter < ITER; iter++) {
281 if (iter % 100 == 0) cerr << ".";
282
283 long n = RandomBnd(5000) + 300;
284 long da = RandomBnd(n)+1;
285 long db = RandomBnd(n)+1;
286
287 if (RandomBnd(2)) { da = n; db = n; }
288
289 zz_pX f;
290 random(f, n);
291 SetCoeff(f, n);
292 zz_pXModulus F(f);
293
294 zz_pX a, b;
295 random(a, da);
296 random(b, db);
297
298 zz_pXMultiplier bb1;
299 build(bb1, b, F);
300
301 zz_pXTransMultiplier bb2;
302 build(bb2, b, F);
303
304 Vec<zz_p> x1, x2;
305
306 UpdateMap(x1, a.rep, bb1, F);
307 UpdateMap(x2, a.rep, bb2, F);
308
309
310 if (x1 != x2) {
311 cerr << "******** oops\n";
312 break;
313 }
314 }
315
316 cerr << "\n";
317 }
318
319 void divremtest()
320 {
321 cerr << "divrem";
322 for (long iter = 0; iter < ITER; iter++) {
323 if (iter % 100 == 0) cerr << ".";
324
325 long n = RandomBnd(5000) + 300;
326 long dq = RandomBnd(n);
327
328
329 zz_pX f;
330 random(f, n);
331 SetCoeff(f, n);
332 zz_pXModulus F(f);
333
334 zz_pX a, q, r, q1, r1;
335
336 random(a, 2*n-1);
337
338 DivRem(q, r, a, F);
339 rem(r1, a, F);
340 div(q1, a, F);
341
342 if (deg(r) >= n || a != q*f + r || q != q1 || r != r1) {
343 cerr << "******** oops\n";
344 break;
345 }
346 }
347
348 cerr << "\n";
349 }
350
351 int main()
352 {
353 long p;
354 p = GenPrime_long(NTL_SP_NBITS);
355
356 zz_p::init(p);
357
358 multest();
359 sqrtest();
360 mulmodtest();
361 sqrmodtest();
362 mulmod1test();
363 divremtest();
364 updatetest();
365
366 zz_p::FFTInit(0);
367
368 cerr << "FFT Prime\n";
369
370 multest();
371 sqrtest();
372 mulmodtest();
373 sqrmodtest();
374 mulmod1test();
375 divremtest();
376 updatetest();
377
378 }
379
+0
-567
src/makefile
0 ###############################################################
1 #
2 # First, choose a C++ compiler, and set compiler flags.
3 # This is done by setting the variables CXX and CXXFLAGS.
4 #
5 ###############################################################
6
7
8
9 CXX=g++
10 # A C++ compiler, e.g., g++, CC, xlC
11
12
13 CXXFLAGS=-g -O2
14 # Flags for the C++ compiler
15
16 CXXAUTOFLAGS= -std=c++11 -pthread -march=native
17 # Flags for the C++ compiler, automatically generated by configuration script
18
19 NOCONTRACT=
20
21
22 AR=ar
23 # command to make a library
24
25 ARFLAGS=ruv
26 # arguments for AR
27
28 RANLIB=ranlib
29 # set to echo if you want to disable it completely
30
31 LDFLAGS=
32 # flags for linking C++ programs
33
34 LDLIBS=-lm
35 # libraries for linking C++ programs
36
37 CPPFLAGS=
38 # arguments for the C preprocessor
39
40 LIBTOOL=libtool
41 # libtool command -- this is now built locally
42
43 LIBTOOL_LINK_FLAGS=
44 # flags to add to command line when building a shared library
45 # mainly used to pass the argument "-no-undefined" on cygwin
46
47 DEF_PREFIX=/usr/local
48
49 PREFIX=$(DEF_PREFIX)
50 LIBDIR=$(PREFIX)/lib
51 INCLUDEDIR=$(PREFIX)/include
52 DOCDIR=$(PREFIX)/share/doc
53 # where to install NTL
54
55 DESTDIR=
56 # added to support standard package building techniques
57 # that install into a "staging area"
58
59 ###############################################################
60 #
61 # Second, if you want to use GMP (the GNU Multi-Precision library),
62 # define the variables GMP_OPT_INCDIR, GMP_OPT_LIBDIR, GMP_OPT_LIB below.
63 # You will also have to set either NTL_GMP_LIP or NTL_GMP_HACK
64 # in the config.h file.
65 #
66 # Using GMP can lead to significant performance gains on some
67 # platforms. You can obtain GMP from http://www.swox.com/gmp.
68 # Once you unpack it into a directory, just execute
69 # ./configure; make
70 # in that directory.
71 #
72 ###############################################################
73
74
75 GMP_PREFIX=$(DEF_PREFIX)
76
77 GMP_INCDIR=$(GMP_PREFIX)/include
78 # directory containing gmp.h if using GMP
79
80 GMP_LIBDIR=$(GMP_PREFIX)/lib
81 # directory containing libgmp.a if using GMP
82
83 GMP_OPT_INCDIR=# -I$(GMP_INCDIR) # GMPI
84 GMP_OPT_LIBDIR=# -L$(GMP_LIBDIR) # GMPL
85 GMP_OPT_LIB=-lgmp # GMP
86 # uncomment these if using GMP
87
88
89 ###############################################################
90 #
91 # Third, if you want to use gf2x (a library for fast
92 # multiplication over GF(2)[X]), you need to
93 # define the variables GF2X_OPT_INCDIR, GF2X_OPT_LIBDIR, GF2X_OPT_LIB below.
94 # You will also have to set NTL_GF2X_LIB
95 # in the config.h file.
96 #
97 ###############################################################
98
99 GF2X_PREFIX=$(DEF_PREFIX)
100
101 GF2X_INCDIR=$(GF2X_PREFIX)/include
102 # directory containing gf2x.h if using gf2x
103
104 GF2X_LIBDIR=$(GF2X_PREFIX)/lib
105 # directory containing libgf2x.a
106
107 GF2X_OPT_INCDIR=# -I$(GF2X_INCDIR) # GF2X
108 GF2X_OPT_LIBDIR=# -L$(GF2X_LIBDIR) # GF2X
109 GF2X_OPT_LIB=# -lgf2x # GF2X
110 # uncomment these if using gf2x
111
112
113 ###############################################################
114 #
115 # Fourth, if you do not want to run the wizard that automagically
116 # sets some performance related flags in config.h, set the flag below.
117 #
118 ###############################################################
119
120
121 WIZARD=off
122 # Set to off if you want to bypass the wizard; otherwise, set to on.
123
124
125 #################################################################
126 #
127 # That's it! You can ignore everything else in this file!
128 #
129 #################################################################
130
131
132 # object files
133 OBJ=FFT.o FacVec.o GF2.o GF2E.o GF2EX.o GF2EXFactoring.o GF2X.o GF2X1.o \
134 GF2XFactoring.o GF2XVec.o GetTime.o GetPID.o HNF.o ctools.o LLL.o LLL_FP.o \
135 LLL_QP.o LLL_RR.o LLL_XD.o RR.o WordVector.o ZZ.o ZZVec.o ZZX.o ZZX1.o \
136 ZZXCharPoly.o ZZXFactoring.o ZZ_p.o ZZ_pE.o ZZ_pEX.o ZZ_pEXFactoring.o ZZ_pX.o \
137 ZZ_pX1.o ZZ_pXCharPoly.o ZZ_pXFactoring.o fileio.o lip.o lzz_p.o lzz_pE.o \
138 lzz_pEX.o lzz_pEXFactoring.o lzz_pX.o lzz_pX1.o lzz_pXCharPoly.o \
139 lzz_pXFactoring.o mat_GF2.o mat_GF2E.o mat_RR.o mat_ZZ.o mat_ZZ_p.o mat_ZZ_pE.o \
140 mat_lzz_p.o mat_lzz_pE.o mat_poly_ZZ.o mat_poly_ZZ_p.o mat_poly_lzz_p.o \
141 quad_float.o tools.o vec_GF2.o vec_GF2E.o vec_RR.o vec_ZZ.o vec_ZZ_p.o \
142 vec_ZZ_pE.o vec_lzz_p.o vec_lzz_pE.o xdouble.o G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o \
143 G_LLL_RR.o thread.o BasicThreadPool.o MatPrime.o
144
145 # library source files
146 SRC=FFT.cpp FacVec.cpp GF2.cpp GF2E.cpp GF2EX.cpp GF2EXFactoring.cpp GF2X.cpp \
147 GF2X1.cpp GF2XFactoring.cpp GF2XVec.cpp HNF.cpp ctools.cpp LLL.cpp LLL_FP.cpp \
148 LLL_QP.cpp LLL_RR.cpp LLL_XD.cpp RR.cpp WordVector.cpp ZZ.cpp ZZVec.cpp ZZX.cpp \
149 ZZX1.cpp ZZXCharPoly.cpp ZZXFactoring.cpp ZZ_p.cpp ZZ_pE.cpp ZZ_pEX.cpp \
150 ZZ_pEXFactoring.cpp ZZ_pX.cpp ZZ_pX1.cpp ZZ_pXCharPoly.cpp ZZ_pXFactoring.cpp \
151 fileio.cpp lip.cpp lzz_p.cpp lzz_pE.cpp lzz_pEX.cpp lzz_pEXFactoring.cpp \
152 lzz_pX.cpp lzz_pX1.cpp lzz_pXCharPoly.cpp lzz_pXFactoring.cpp mat_GF2.cpp \
153 mat_GF2E.cpp mat_RR.cpp mat_ZZ.cpp mat_ZZ_p.cpp mat_ZZ_pE.cpp mat_lzz_p.cpp \
154 mat_lzz_pE.cpp mat_poly_ZZ.cpp mat_poly_ZZ_p.cpp mat_poly_lzz_p.cpp \
155 quad_float.cpp tools.cpp vec_GF2.cpp vec_GF2E.cpp vec_RR.cpp vec_ZZ.cpp \
156 vec_ZZ_p.cpp vec_ZZ_pE.cpp vec_lzz_p.cpp vec_lzz_pE.cpp xdouble.cpp \
157 G_LLL_FP.cpp G_LLL_QP.cpp G_LLL_XD.cpp G_LLL_RR.cpp thread.cpp \
158 BasicThreadPool.cpp MatPrime.cpp
159
160
161
162 # library header files
163 INCL=FFT.h FacVec.h GF2.h GF2E.h GF2EX.h GF2EXFactoring.h GF2X.h \
164 GF2XFactoring.h GF2XVec.h HNF.h ctools.h LLL.h RR.h WordVector.h \
165 ZZ.h ZZ_limbs.h sp_arith.h ZZVec.h ZZX.h ZZXFactoring.h ZZ_p.h ZZ_pE.h ZZ_pEX.h \
166 ZZ_pEXFactoring.h ZZ_pX.h ZZ_pXFactoring.h fileio.h lip.h lzz_p.h lzz_pE.h \
167 lzz_pEX.h lzz_pEXFactoring.h lzz_pX.h lzz_pXFactoring.h mat_GF2.h mat_GF2E.h \
168 mat_RR.h mat_ZZ.h mat_ZZ_p.h mat_ZZ_pE.h mat_lzz_p.h mat_lzz_pE.h mat_poly_ZZ.h \
169 mat_poly_ZZ_p.h mat_poly_lzz_p.h matrix.h pair.h vector.h pair_GF2EX_long.h \
170 pair_GF2X_long.h pair_ZZX_long.h pair_ZZ_pEX_long.h pair_ZZ_pX_long.h \
171 pair_lzz_pEX_long.h pair_lzz_pX_long.h quad_float.h tools.h vec_GF2.h \
172 vec_GF2E.h vec_GF2XVec.h vec_RR.h vec_ZZ.h vec_ZZVec.h vec_ZZ_p.h vec_ZZ_pE.h \
173 vec_double.h vec_long.h vec_lzz_p.h vec_lzz_pE.h vec_quad_float.h vec_vec_GF2.h \
174 vec_vec_GF2E.h vec_vec_RR.h vec_vec_ZZ.h vec_vec_ZZ_p.h vec_vec_ZZ_pE.h \
175 vec_vec_long.h vec_vec_lzz_p.h vec_vec_lzz_pE.h vec_xdouble.h xdouble.h \
176 config.h version.h new.h vec_ulong.h vec_vec_ulong.h SmartPtr.h \
177 Lazy.h LazyTable.h thread.h BasicThreadPool.h MatPrime.h
178
179
180
181 # test data
182 TD=BerlekampTestIn BerlekampTestOut CanZassTestIn CanZassTestOut \
183 ZZXFacTestIn ZZXFacTestOut MoreFacTestIn LLLTestIn LLLTestOut RRTestIn RRTestOut \
184 MatrixTestIn MatrixTestOut CharPolyTestIn \
185 CharPolyTestOut QuadTestIn QuadTestOut
186
187
188 # test source files
189 TS=QuickTest.cpp ZZTest.cpp BerlekampTest.cpp CanZassTest.cpp ZZXFacTest.cpp \
190 MoreFacTest.cpp LLLTest.cpp subset.cpp MatrixTest.cpp mat_lzz_pTest.cpp \
191 CharPolyTest.cpp RRTest.cpp QuadTest.cpp GF2XTest.cpp GF2EXTest.cpp \
192 BitMatTest.cpp ZZ_pEXTest.cpp lzz_pEXTest.cpp Timing.cpp ThreadTest.cpp \
193 ExceptionTest.cpp
194
195 # scripts
196 SCRIPTS=MakeGetTime MakeGetPID MakeCheckFeatures ResetFeatures CopyFeatures \
197 TestScript dosify unixify RemoveProg configure DoConfig mfile cfile ppscript
198
199
200 # auxiliary source
201 MD=MakeDesc.cpp MakeDescAux.cpp newnames.cpp gen_gmp_aux.cpp gf2x_version_1_2_or_later_required.cpp
202 GT=GetTime0.cpp GetTime1.cpp GetTime2.cpp GetTime3.cpp GetTime4.cpp GetTime5.cpp TestGetTime.cpp
203 GP=GetPID1.cpp GetPID2.cpp TestGetPID.cpp
204 CH=CheckCompile.cpp GenConfigInfo.cpp CheckContract.cpp CheckContractAux.cpp \
205 CheckThreads.cpp
206
207 AUXPROGS = TestGetTime TestGetPID CheckFeatures CheckCompile GenConfigInfo CheckContract \
208 CheckThreads
209
210 FEATURES=ALIGNED_ARRAY BUILTIN_CLZL LL_TYPE SSSE3 AVX PCLMUL AVX2 FMA \
211 COPY_TRAITS1 COPY_TRAITS2 CHRONO_TIME MACOS_TIME POSIX_TIME
212
213
214 # documentation
215
216
217 DFILES=copying.txt BasicThreadPool.txt GF2.txt GF2E.txt GF2EX.txt \
218 GF2EXFactoring.txt GF2X.txt GF2XFactoring.txt GF2XVec.txt HNF.txt Lazy.txt \
219 LazyTable.txt LLL.txt RR.txt SmartPtr.txt ZZ.txt ZZ_limbs.txt ZZVec.txt ZZX.txt \
220 ZZXFactoring.txt ZZ_p.txt ZZ_pE.txt ZZ_pEX.txt ZZ_pEXFactoring.txt ZZ_pX.txt \
221 ZZ_pXFactoring.txt conversions.txt flags.txt lzz_p.txt lzz_pE.txt lzz_pEX.txt \
222 lzz_pEXFactoring.txt lzz_pX.txt lzz_pXFactoring.txt mat_GF2.txt mat_GF2E.txt \
223 mat_RR.txt mat_ZZ.txt mat_ZZ_p.txt mat_ZZ_pE.txt mat_lzz_p.txt mat_lzz_pE.txt \
224 mat_poly_ZZ.txt mat_poly_ZZ_p.txt mat_poly_lzz_p.txt matrix.txt pair.txt \
225 vector.txt quad_float.txt sedscript.txt tools.txt vec_GF2.txt vec_GF2E.txt \
226 vec_RR.txt vec_ZZ.txt vec_ZZ_p.txt vec_ZZ_pE.txt vec_lzz_p.txt vec_lzz_pE.txt \
227 xdouble.txt names.txt tour-ack.html tour-intro.html tour-time.html \
228 tour-changes.html tour-modules.html tour-unix.html tour-examples.html \
229 tour-roadmap.html tour-win.html tour-impl.html tour-struct.html tour.html \
230 tour-ex1.html tour-ex2.html tour-ex3.html tour-ex4.html tour-ex5.html \
231 tour-ex6.html tour-ex7.html arrow1.gif arrow2.gif arrow3.gif tour-gmp.html \
232 tour-gf2x.html tour-tips.html config.txt version.txt
233
234
235
236 TXFILES=GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt \
237 GF2XFactoring.txt GF2XVec.txt HNF.txt Lazy.txt LazyTable.txt LLL.txt RR.txt \
238 SmartPtr.txt ZZ.txt ZZ_limbs.txt ZZVec.txt ZZX.txt ZZXFactoring.txt ZZ_p.txt ZZ_pE.txt \
239 ZZ_pEX.txt ZZ_pEXFactoring.txt ZZ_pX.txt ZZ_pXFactoring.txt lzz_p.txt \
240 lzz_pE.txt lzz_pEX.txt lzz_pEXFactoring.txt lzz_pX.txt lzz_pXFactoring.txt \
241 mat_GF2.txt mat_GF2E.txt mat_RR.txt mat_ZZ.txt mat_ZZ_p.txt mat_ZZ_pE.txt \
242 mat_lzz_p.txt mat_lzz_pE.txt mat_poly_ZZ.txt mat_poly_ZZ_p.txt \
243 mat_poly_lzz_p.txt matrix.txt pair.txt quad_float.txt tools.txt vec_GF2.txt \
244 vec_GF2E.txt vec_RR.txt vec_ZZ.txt vec_ZZ_p.txt vec_ZZ_pE.txt vec_lzz_p.txt \
245 vec_lzz_pE.txt vector.txt version.txt xdouble.txt BasicThreadPool.txt
246
247
248 HTFILES=GF2.cpp.html GF2E.cpp.html GF2EX.cpp.html GF2EXFactoring.cpp.html \
249 GF2X.cpp.html GF2XFactoring.cpp.html GF2XVec.cpp.html HNF.cpp.html \
250 Lazy.cpp.html LazyTable.cpp.html LLL.cpp.html RR.cpp.html SmartPtr.cpp.html \
251 ZZ.cpp.html ZZ_limbs.cpp.html ZZVec.cpp.html ZZX.cpp.html ZZXFactoring.cpp.html ZZ_p.cpp.html \
252 ZZ_pE.cpp.html ZZ_pEX.cpp.html ZZ_pEXFactoring.cpp.html ZZ_pX.cpp.html \
253 ZZ_pXFactoring.cpp.html lzz_p.cpp.html lzz_pE.cpp.html lzz_pEX.cpp.html \
254 lzz_pEXFactoring.cpp.html lzz_pX.cpp.html lzz_pXFactoring.cpp.html \
255 mat_GF2.cpp.html mat_GF2E.cpp.html mat_RR.cpp.html mat_ZZ.cpp.html \
256 mat_ZZ_p.cpp.html mat_ZZ_pE.cpp.html mat_lzz_p.cpp.html mat_lzz_pE.cpp.html \
257 mat_poly_ZZ.cpp.html mat_poly_ZZ_p.cpp.html mat_poly_lzz_p.cpp.html \
258 matrix.cpp.html pair.cpp.html quad_float.cpp.html tools.cpp.html \
259 vec_GF2.cpp.html vec_GF2E.cpp.html vec_RR.cpp.html vec_ZZ.cpp.html \
260 vec_ZZ_p.cpp.html vec_ZZ_pE.cpp.html vec_lzz_p.cpp.html vec_lzz_pE.cpp.html \
261 vector.cpp.html version.cpp.html xdouble.cpp.html BasicThreadPool.cpp.html
262
263
264
265 DOC = $(DFILES) $(HTFILES)
266
267
268 # test program executables
269 PROGS=QuickTest ZZTest BerlekampTest CanZassTest ZZXFacTest MoreFacTest LLLTest \
270 BitMatTest MatrixTest mat_lzz_pTest CharPolyTest RRTest QuadTest GF2XTest \
271 GF2EXTest subset ZZ_pEXTest lzz_pEXTest Timing ThreadTest
272
273 # things to save to a tar file
274 SFILES=makefile $(SRC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) mach_desc.win \
275 Poly1TimeTest.cpp Poly2TimeTest.cpp Poly3TimeTest.cpp GF2XTimeTest.cpp \
276 InitSettings.cpp DispSettings.cpp WizardAux Wizard
277
278
279 #################################################################
280 #
281 # Rules for compiling the library
282 #
283 #################################################################
284
285
286 NTL_INCLUDE = -I../include -I.
287 # NTL needs this to find its include files
288
289 COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) $(CXXAUTOFLAGS) -c
290
291 LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) $(CXXAUTOFLAGS) $(LDFLAGS)
292
293
294
295 # 'make' or 'make all' does a complete make, including additional
296 # setup not done in configure.
297
298 # The file setup-phase is removed by the configure script
299 # when it terminates successfully.
300
301 # The file need-to-run-configure is removed by the configure script
302 # before making any changes to makefile/config.h and is recreated
303 # when it terminates successfully.
304
305 all: setup-phase
306 $(MAKE) ntl.a
307
308 setup-phase: need-to-run-configure
309 $(MAKE) clobber
310 $(MAKE) setup1
311 $(MAKE) setup2
312 $(MAKE) setup3
313 $(MAKE) setup4
314 touch setup-phase
315
316
317 # setup1 generates the file ../include/NTL/mach_desc.h
318
319 setup1:
320 $(COMPILE) MakeDescAux.cpp
321 $(LINK) -o MakeDesc MakeDesc.cpp MakeDescAux.o $(LDLIBS)
322 ./MakeDesc
323 mv mach_desc.h ../include/NTL/mach_desc.h
324
325
326 # setup2 does some dynamic checks for GetTime, GetPID, and other features
327
328 setup2:
329 echo "*** CheckFeatures log ***" > CheckFeatures.log
330 sh MakeGetTime "$(LINK)" "$(LDLIBS)"
331 sh MakeGetPID "$(LINK)" "$(LDLIBS)"
332 sh MakeCheckFeatures "$(FEATURES)" "$(LINK)" "$(LDLIBS)"
333
334 # NOTE: to add a feature XXX:
335 # * add a program CheckXXX.cpp which returns 0 if XXX works, -1 otherwise
336 # * add XXX to the FEATURES variable
337
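As an illustration of the feature-check convention in the NOTE just above (an editorial sketch, not part of the makefile or of NTL itself; the name CheckFoo and its test body are made up), such a check is just a tiny translation unit whose exit status reports whether the feature works:

    // CheckFoo.cpp -- hypothetical feature test following the stated convention:
    // return 0 if feature "Foo" works on this platform, -1 otherwise.
    int main()
    {
       bool foo_works = true;   // placeholder: actually exercise the feature here
       if (foo_works) return 0;
       return -1;
    }

The MakeCheckFeatures invocation in setup2 above drives one such program per entry in the FEATURES variable.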
338 # setup3 generates the file ../include/NTL/gmp_aux.h
339 # The file ../include/NTL/gmp_aux.h is included in ../include/NTL/lip.h
340 # when NTL_GMP_LIP is set.
341 # When this flag is not set, an empty file is produced.
342 # This also checks that the right version of the gf2x library is present.
343
344 setup3:
345 $(LINK) $(GMP_OPT_INCDIR) -o gen_gmp_aux gen_gmp_aux.cpp $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
346 ./gen_gmp_aux > ../include/NTL/gmp_aux.h
347 $(LINK) $(GF2X_OPT_INCDIR) -o gf2x_version_1_2_or_later_required gf2x_version_1_2_or_later_required.cpp $(GF2X_OPT_LIBDIR) $(GF2X_OPT_LIB) $(LDLIBS)
348
349 # setup4 runs the wizard
350
351 setup4:
352 sh Wizard $(WIZARD) "$(MAKE)" "$(FEATURES)"
353
354
355 ntl.a: $(OBJ)
356 $(AR) $(ARFLAGS) ntl.a $(OBJ) #LSTAT
357 - $(RANLIB) ntl.a #LSTAT
358 # $(LIBTOOL) --tag=CXX --mode=link $(LINK) $(LIBTOOL_LINK_FLAGS) -o libntl.la $(OBJ:.o=.lo) $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(GF2X_OPT_LIBDIR) $(GF2X_OPT_LIB) $(LDLIBS) -rpath $(LIBDIR) -version-info `cat VERSION_INFO` #LSHAR
359
360 LCOMP= #LSTAT
361 # LCOMP=$(LIBTOOL) --tag=CXX --mode=compile #LSHAR
362
363 lip.o: lip.cpp
364 $(LCOMP) $(COMPILE) $(GMP_OPT_INCDIR) lip.cpp
365
366 GF2X.o: GF2X.cpp
367 $(LCOMP) $(COMPILE) $(GF2X_OPT_INCDIR) GF2X.cpp
368
369 quad_float.o: quad_float.cpp
370 $(LCOMP) $(COMPILE) $(NOCONTRACT) quad_float.cpp
371
372 CheckCompile: CheckCompile.cpp
373 $(LINK) -o CheckCompile CheckCompile.cpp $(LDLIBS)
374
375 GenConfigInfo: GenConfigInfo.cpp
376 $(LINK) -o GenConfigInfo GenConfigInfo.cpp $(LDLIBS)
377
378 CheckContract: CheckContract.cpp CheckContractAux.cpp
379 $(LINK) $(NOCONTRACT) -o CheckContract CheckContract.cpp CheckContractAux.cpp $(LDLIBS)
380
381 CheckThreads: CheckThreads.cpp
382 $(LINK) -o CheckThreads CheckThreads.cpp $(LDLIBS)
383
384
385 .cpp.o:
386 $(LCOMP) $(COMPILE) $<
387
388 .cpp:
389 $(LINK) -o $@ $< ntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(GF2X_OPT_LIBDIR) $(GF2X_OPT_LIB) $(LDLIBS) #LSTAT
390 # $(LIBTOOL) --tag=CXX --mode=link $(LINK) -o $@ $< libntl.la #LSHAR
391
392 #################################################################
393 #
394 # Rule for running tests
395 # make check runs a series of tests
396 #
397 #################################################################
398
399 check:
400 sh RemoveProg $(PROGS)
401 $(MAKE) QuickTest
402 ./QuickTest
403 sh RemoveProg QuickTest
404 sh TestScript "$(MAKE)"
405
406 #################################################################
407 #
408 # Rule for installing
409 # make install just does a simple copy of the include files
410 # and library. The -p option is used to preserve file attributes.
411 # This helps avoid some problems (especially when copying ntl.a).
412 # Also, an attempt is made to make everything that is
413 # installed readable by everyone.
414 #
415 # make uninstall removes these files
416 #
417 #################################################################
418
419
420
421
422 install:
423 mkdir -p -m 755 $(DESTDIR)$(INCLUDEDIR)
424 rm -rf $(DESTDIR)$(INCLUDEDIR)/NTL
425 mkdir -m 755 $(DESTDIR)$(INCLUDEDIR)/NTL
426 cp -p ../include/NTL/*.h $(DESTDIR)$(INCLUDEDIR)/NTL
427 - chmod -R a+r $(DESTDIR)$(INCLUDEDIR)/NTL
428 mkdir -p -m 755 $(DESTDIR)$(DOCDIR)
429 rm -rf $(DESTDIR)$(DOCDIR)/NTL
430 mkdir -m 755 $(DESTDIR)$(DOCDIR)/NTL
431 cp -p ../doc/*.txt $(DESTDIR)$(DOCDIR)/NTL
432 cp -p ../doc/*.html $(DESTDIR)$(DOCDIR)/NTL
433 cp -p ../doc/*.gif $(DESTDIR)$(DOCDIR)/NTL
434 - chmod -R a+r $(DESTDIR)$(DOCDIR)/NTL
435 mkdir -p -m 755 $(DESTDIR)$(LIBDIR)
436 cp -p ntl.a $(DESTDIR)$(LIBDIR)/libntl.a #LSTAT
437 - chmod a+r $(DESTDIR)$(LIBDIR)/libntl.a #LSTAT
438 # $(LIBTOOL) --mode=install cp -p libntl.la $(DESTDIR)$(LIBDIR) #LSHAR
439
440
441 uninstall:
442 rm -f $(LIBDIR)/libntl.a #LSTAT
443 # $(LIBTOOL) --mode=uninstall rm -f $(LIBDIR)/libntl.la #LSHAR
444 rm -rf $(INCLUDEDIR)/NTL
445 rm -rf $(DOCDIR)/NTL
446
447 #################################################################
448 #
449 # Rules for cleaning up
450 #
451 # make clobber removes *everything* created by make,
452 # but it does not restore config.h to its default.
453 #
454 # make clean tidies up a bit
455 #
456 #################################################################
457
458 clobber:
459 rm -f ntl.a mach_desc.h ../include/NTL/mach_desc.h GetTime.cpp GetPID.cpp
460 sh ResetFeatures '..' "$(FEATURES)"
461 rm -f ../include/NTL/gmp_aux.h
462 sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux gf2x_version_1_2_or_later_required
463 rm -f *.o
464 rm -rf small
465 rm -f cfileout mfileout
466 rm -rf .libs *.lo libntl.la
467 rm -f setup-phase
468
469 clean:
470 sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux gf2x_version_1_2_or_later_required
471 rm -f *.o
472 rm -rf small
473 # - $(LIBTOOL) --mode=clean rm -f libntl.la *.lo #LSHAR
474
475 wclean:
476 rm -f *.o
477
478 #################################################################
479 #
480 # Rules for making tar and zip files
481 #
482 # make libtool-gen-origin generates the directory
483 # libtool-origin that is included in the distribution
484 # - this only needs to be run very occasionally, to keep
485 # libtool relatively up-to-date
486 # - it must be run on a machine with autotools
487 #
488 # make ppdoc creates pretty-printed versions of some documentation
489 # - run before make package or make winpack
490 #
491 # make package creates a tar.gz file suitable for Unix
492 #
493 # make winpack creates a zip file suitable for Windows
494 #
495 #################################################################
496
497 libtool-gen-origin:
498 rm -rf libtool-origin && \
499 cp -R libtool-seed libtool-origin && \
500 cd libtool-origin && autoreconf -fiv && rm -rf autom4te.cache
501
502 ppdoc:
503 sh ppscript "$(TXFILES)"
504
505 ppclean:
506 rm -f ../doc/*.cpp
507
508
509 package:
510 sh unixify "$(SFILES) DIRNAME WINDIR VERSION_INFO NOTES" "$(INCL)" "$(DOC)" "$(FEATURES)"
511 rm -rf `cat DIRNAME`
512 rm -f `cat DIRNAME`.tar
513 rm -f `cat DIRNAME`.tar.gz
514 mv unix `cat DIRNAME`
515 chmod -R a+rX `cat DIRNAME`
516 tar -cvf `cat DIRNAME`.tar `cat DIRNAME`
517 gzip `cat DIRNAME`.tar
518 rm -rf `cat DIRNAME`
519
520 winpack:
521 ./configure --nowrite NTL_GMP_LIP=off NTL_TLS_HACK=off
522 sh dosify "$(SRC)" "$(INCL)" "$(DOC)" "$(TS)" "$(TD)" "$(FEATURES)"
523 rm -rf `cat WINDIR`
524 rm -f `cat WINDIR`.zip
525 mv dos `cat WINDIR`
526 chmod -R a+rX `cat WINDIR`
527 find ./`cat WINDIR` '!' -name '*.gif' -print | zip -l `cat WINDIR` -@
528 find ./`cat WINDIR` -name '*.gif' -print | zip -u `cat WINDIR` -@
529 rm -rf `cat WINDIR`
530
531
532 ######################################################################
533 #
534 # config wizard related stuff
535 #
536 ######################################################################
537
538 WOBJ=FFT.o GetTime.o GetPID.o ctools.o ZZ.o ZZVec.o ZZ_p.o ZZ_pX.o ZZ_pX1.o \
539 lip.o tools.o vec_ZZ.o vec_ZZ_p.o GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o \
540 thread.o BasicThreadPool.o fileio.o
541
542
543 # wntl.a: LCOMP= #LSHAR
544 wntl.a: $(WOBJ)
545 $(AR) $(ARFLAGS) wntl.a $(WOBJ)
546 - $(RANLIB) wntl.a
547
548
549 Poly1TimeTest:
550 $(LINK) -o Poly1TimeTest Poly1TimeTest.cpp wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
551 Poly2TimeTest:
552 $(LINK) -o Poly2TimeTest Poly2TimeTest.cpp wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
553 Poly3TimeTest:
554 $(LINK) -o Poly3TimeTest Poly3TimeTest.cpp wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
555
556
557 GF2XTimeTest:
558 $(LINK) -o GF2XTimeTest GF2XTimeTest.cpp wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
559
560 InitSettings:
561 $(LINK) -o InitSettings InitSettings.cpp $(LDLIBS)
562
563
564 DispSettings:
565 $(LINK) -o DispSettings DispSettings.cpp $(LDLIBS)
566
619619 //
620620 // ******************************************************************
621621
622 //#undef NTL_HAVE_AVX
623 //#undef NTL_HAVE_FMA
624 //#undef NTL_HAVE_AVX512F
625 // for testing purposes
626
627 #if (defined(NTL_HAVE_AVX512F) && defined(NTL_AVOID_AVX512))
628 #undef NTL_HAVE_AVX512F
629 #endif
630
622631 #define MAT_BLK_SZ (32)
623632
624633
643652 #else
644653 #define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c))
645654 #endif
655
656
657 #ifdef NTL_HAVE_AVX512F
658 #define MUL_ADD512(a, b, c) a = _mm512_fmadd_pd(b, c, a)
659 #endif
660
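// Exposition note (not part of the library source): MUL_ADD512(acc, avec, bvec)
// is an elementwise fused multiply-add on the 8 doubles packed in a __m512d,
// i.e. acc[j] += avec[j] * bvec[j] for j = 0..7. The MUL_ADD macro above does
// the same on the 4 doubles of a __m256d.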
661
662
663 #ifdef NTL_HAVE_AVX512F
664
665 static
666 void muladd1_by_32(double *x, const double *a, const double *b, long n)
667 {
668 __m512d avec0, bvec;
669
670 __m512d acc00, acc01, acc02, acc03;
671
672 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
673 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
674 acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
675 acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);
676
677 for (long i = 0; i < n; i++) {
678 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
679
680 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
681 MUL_ADD512(acc00, avec0, bvec);
682
683 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
684 MUL_ADD512(acc01, avec0, bvec);
685
686 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
687 MUL_ADD512(acc02, avec0, bvec);
688
689 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
690 MUL_ADD512(acc03, avec0, bvec);
691 }
692
693
694 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
695 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
696 _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
697 _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);
698
699 }
700
701 static
702 void muladd2_by_32(double *x, const double *a, const double *b, long n)
703 {
704 __m512d avec0, avec1, bvec;
705
706 __m512d acc00, acc01, acc02, acc03;
707 __m512d acc10, acc11, acc12, acc13;
708
709
710
711 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
712 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
713 acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
714 acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);
715
716 acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
717 acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
718 acc12=_mm512_load_pd(x + 2*8 + 1*MAT_BLK_SZ);
719 acc13=_mm512_load_pd(x + 3*8 + 1*MAT_BLK_SZ);
720
721 for (long i = 0; i < n; i++) {
722 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
723 avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
724
725 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
726 MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
727
728 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
729 MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
730
731 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
732 MUL_ADD512(acc02, avec0, bvec); MUL_ADD512(acc12, avec1, bvec);
733
734 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
735 MUL_ADD512(acc03, avec0, bvec); MUL_ADD512(acc13, avec1, bvec);
736 }
737
738
739 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
740 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
741 _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
742 _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);
743
744 _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
745 _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
746 _mm512_store_pd(x + 2*8 + 1*MAT_BLK_SZ, acc12);
747 _mm512_store_pd(x + 3*8 + 1*MAT_BLK_SZ, acc13);
748
749 }
750
751
752 static
753 void muladd3_by_32(double *x, const double *a, const double *b, long n)
754 {
755 __m512d avec0, avec1, avec2, bvec;
756
757 __m512d acc00, acc01, acc02, acc03;
758 __m512d acc10, acc11, acc12, acc13;
759 __m512d acc20, acc21, acc22, acc23;
760
761
762
763 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
764 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
765 acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
766 acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);
767
768 acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
769 acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
770 acc12=_mm512_load_pd(x + 2*8 + 1*MAT_BLK_SZ);
771 acc13=_mm512_load_pd(x + 3*8 + 1*MAT_BLK_SZ);
772
773 acc20=_mm512_load_pd(x + 0*8 + 2*MAT_BLK_SZ);
774 acc21=_mm512_load_pd(x + 1*8 + 2*MAT_BLK_SZ);
775 acc22=_mm512_load_pd(x + 2*8 + 2*MAT_BLK_SZ);
776 acc23=_mm512_load_pd(x + 3*8 + 2*MAT_BLK_SZ);
777
778 for (long i = 0; i < n; i++) {
779 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
780 avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
781 avec2 = _mm512_set1_pd(a[i+2*MAT_BLK_SZ]);
782
783 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
784 MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
785 MUL_ADD512(acc20, avec2, bvec);
786
787 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
788 MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
789 MUL_ADD512(acc21, avec2, bvec);
790
791 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
792 MUL_ADD512(acc02, avec0, bvec); MUL_ADD512(acc12, avec1, bvec);
793 MUL_ADD512(acc22, avec2, bvec);
794
795 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
796 MUL_ADD512(acc03, avec0, bvec); MUL_ADD512(acc13, avec1, bvec);
797 MUL_ADD512(acc23, avec2, bvec);
798 }
799
800 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
801 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
802 _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
803 _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);
804
805 _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
806 _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
807 _mm512_store_pd(x + 2*8 + 1*MAT_BLK_SZ, acc12);
808 _mm512_store_pd(x + 3*8 + 1*MAT_BLK_SZ, acc13);
809
810 _mm512_store_pd(x + 0*8 + 2*MAT_BLK_SZ, acc20);
811 _mm512_store_pd(x + 1*8 + 2*MAT_BLK_SZ, acc21);
812 _mm512_store_pd(x + 2*8 + 2*MAT_BLK_SZ, acc22);
813 _mm512_store_pd(x + 3*8 + 2*MAT_BLK_SZ, acc23);
814
815
816 }
817
818
819 static
820 void muladd1_by_16(double *x, const double *a, const double *b, long n)
821 {
822 __m512d avec0, bvec;
823
824 __m512d acc00, acc01;
825
826
827
828 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
829 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
830
831 for (long i = 0; i < n; i++) {
832 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
833
834 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
835 MUL_ADD512(acc00, avec0, bvec);
836
837 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
838 MUL_ADD512(acc01, avec0, bvec);
839 }
840
841
842 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
843 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
844
845 }
846
847 static
848 void muladd2_by_16(double *x, const double *a, const double *b, long n)
849 {
850 __m512d avec0, avec1, bvec;
851
852 __m512d acc00, acc01;
853 __m512d acc10, acc11;
854
855
856
857 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
858 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
859
860 acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
861 acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
862
863 for (long i = 0; i < n; i++) {
864 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
865 avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
866
867 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
868 MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
869
870 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
871 MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
872 }
873
874
875 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
876 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
877
878 _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
879 _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
880 }
881
882
883 static
884 void muladd3_by_16(double *x, const double *a, const double *b, long n)
885 {
886 __m512d avec0, avec1, avec2, bvec;
887
888 __m512d acc00, acc01;
889 __m512d acc10, acc11;
890 __m512d acc20, acc21;
891
892
893
894 acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
895 acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
896
897 acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
898 acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
899
900 acc20=_mm512_load_pd(x + 0*8 + 2*MAT_BLK_SZ);
901 acc21=_mm512_load_pd(x + 1*8 + 2*MAT_BLK_SZ);
902
903
904 for (long i = 0; i < n; i++) {
905 avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
906 avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
907 avec2 = _mm512_set1_pd(a[i+2*MAT_BLK_SZ]);
908
909 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
910 MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
911 MUL_ADD512(acc20, avec2, bvec);
912
913 bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
914 MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
915 MUL_ADD512(acc21, avec2, bvec);
916 }
917
918
919 _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
920 _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
921
922 _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
923 _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
924
925 _mm512_store_pd(x + 0*8 + 2*MAT_BLK_SZ, acc20);
926 _mm512_store_pd(x + 1*8 + 2*MAT_BLK_SZ, acc21);
927
928 }
929
930
931
932 #else
646933
647934 static
648935 void muladd1_by_32(double *x, const double *a, const double *b, long n)
686973 }
687974
688975 static
976 void muladd2_by_32(double *x, const double *a, const double *b, long n)
977 {
978 __m256d avec0, avec1, bvec;
979 __m256d acc00, acc01, acc02, acc03;
980 __m256d acc10, acc11, acc12, acc13;
981
982
983 // round 0
984
985 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
986 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
987 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
988 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
989
990 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
991 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
992 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
993 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
994
995 for (long i = 0; i < n; i++) {
996 avec0 = _mm256_broadcast_sd(&a[i]);
997 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
998
999 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
1000 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
1001 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
1002 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
1003 }
1004
1005
1006 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
1007 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
1008 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
1009 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
1010
1011 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
1012 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
1013 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
1014 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
1015
1016 // round 1
1017
1018 acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
1019 acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
1020 acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
1021 acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
1022
1023 acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
1024 acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
1025 acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
1026 acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
1027
1028 for (long i = 0; i < n; i++) {
1029 avec0 = _mm256_broadcast_sd(&a[i]);
1030 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
1031
1032 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
1033 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
1034 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
1035 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
1036 }
1037
1038
1039 _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
1040 _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
1041 _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
1042 _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
1043
1044 _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
1045 _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
1046 _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
1047 _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
1048
1049 }
1050
1051 // NOTE: this makes things slower on an AVX1 platform --- not enough registers
1052 // it could be faster on AVX2/FMA, where there should be enough registers
1053 static
1054 void muladd3_by_32(double *x, const double *a, const double *b, long n)
1055 {
1056 __m256d avec0, avec1, avec2, bvec;
1057 __m256d acc00, acc01, acc02, acc03;
1058 __m256d acc10, acc11, acc12, acc13;
1059 __m256d acc20, acc21, acc22, acc23;
1060
1061
1062 // round 0
1063
1064 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
1065 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
1066 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
1067 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
1068
1069 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
1070 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
1071 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
1072 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
1073
1074 acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
1075 acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
1076 acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
1077 acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);
1078
1079 for (long i = 0; i < n; i++) {
1080 avec0 = _mm256_broadcast_sd(&a[i]);
1081 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
1082 avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
1083
1084 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
1085 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
1086 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
1087 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
1088 }
1089
1090
1091 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
1092 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
1093 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
1094 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
1095
1096 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
1097 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
1098 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
1099 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
1100
1101 _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
1102 _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
1103 _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
1104 _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);
1105
1106 // round 1
1107
1108 acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
1109 acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
1110 acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
1111 acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
1112
1113 acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
1114 acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
1115 acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
1116 acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
1117
1118 acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
1119 acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
1120 acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
1121 acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);
1122
1123 for (long i = 0; i < n; i++) {
1124 avec0 = _mm256_broadcast_sd(&a[i]);
1125 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
1126 avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
1127
1128 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
1129 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
1130 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
1131 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
1132 }
1133
1134
1135 _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
1136 _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
1137 _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
1138 _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
1139
1140 _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
1141 _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
1142 _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
1143 _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
1144
1145 _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
1146 _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
1147 _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
1148 _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);
1149
1150 }
1151
1152 static
6891153 void muladd1_by_16(double *x, const double *a, const double *b, long n)
6901154 {
6911155 __m256d avec, bvec;
7161180 }
7171181
7181182
719 // experiment: process two rows at a time
1183
7201184 static
721 void muladd2_by_32(double *x, const double *a, const double *b, long n)
1185 void muladd2_by_16(double *x, const double *a, const double *b, long n)
7221186 {
7231187 __m256d avec0, avec1, bvec;
7241188 __m256d acc00, acc01, acc02, acc03;
7581222 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
7591223 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
7601224
761 // round 1
762
763 acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
764 acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
765 acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
766 acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
767
768 acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
769 acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
770 acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
771 acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
772
773 for (long i = 0; i < n; i++) {
774 avec0 = _mm256_broadcast_sd(&a[i]);
775 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
776
777 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
778 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
779 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
780 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
781 }
782
783
784 _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
785 _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
786 _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
787 _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
788
789 _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
790 _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
791 _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
792 _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
793
794 }
795
796
797 // experiment: process three rows at a time
798 // NOTE: this makes things slower on an AVX1 platform --- not enough registers
799 // it could be faster on AVX2/FMA, where there should be enough registers
1225 }
1226
8001227
8011228 static
802 void muladd3_by_32(double *x, const double *a, const double *b, long n)
1229 void muladd3_by_16(double *x, const double *a, const double *b, long n)
8031230 {
8041231 __m256d avec0, avec1, avec2, bvec;
8051232 __m256d acc00, acc01, acc02, acc03;
8511278 _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
8521279 _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);
8531280
854 // round 1
855
856 acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
857 acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
858 acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
859 acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
860
861 acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
862 acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
863 acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
864 acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
865
866 acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
867 acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
868 acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
869 acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);
870
871 for (long i = 0; i < n; i++) {
872 avec0 = _mm256_broadcast_sd(&a[i]);
873 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
874 avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
875
876 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
877 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
878 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
879 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
880 }
881
882
883 _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
884 _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
885 _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
886 _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
887
888 _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
889 _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
890 _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
891 _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
892
893 _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
894 _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
895 _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
896 _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);
897
898 }
899
900 static
901 void muladd2_by_16(double *x, const double *a, const double *b, long n)
902 {
903 __m256d avec0, avec1, bvec;
904 __m256d acc00, acc01, acc02, acc03;
905 __m256d acc10, acc11, acc12, acc13;
906
907
908 // round 0
909
910 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
911 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
912 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
913 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
914
915 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
916 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
917 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
918 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
919
920 for (long i = 0; i < n; i++) {
921 avec0 = _mm256_broadcast_sd(&a[i]);
922 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
923
924 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
925 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
926 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
927 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
928 }
929
930
931 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
932 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
933 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
934 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
935
936 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
937 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
938 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
939 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
940
941 }
942
943 static
944 void muladd3_by_16(double *x, const double *a, const double *b, long n)
945 {
946 __m256d avec0, avec1, avec2, bvec;
947 __m256d acc00, acc01, acc02, acc03;
948 __m256d acc10, acc11, acc12, acc13;
949 __m256d acc20, acc21, acc22, acc23;
950
951
952 // round 0
953
954 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
955 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
956 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
957 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
958
959 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
960 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
961 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
962 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
963
964 acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
965 acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
966 acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
967 acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);
968
969 for (long i = 0; i < n; i++) {
970 avec0 = _mm256_broadcast_sd(&a[i]);
971 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
972 avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
973
974 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
975 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
976 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
977 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
978 }
979
980
981 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
982 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
983 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
984 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
985
986 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
987 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
988 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
989 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
990
991 _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
992 _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
993 _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
994 _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);
995
996 }
1281 }
1282
1283
1284
1285
1286 #endif
1287
1288
1289
9971290
9981291 static inline
9991292 void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
10001293 {
10011294 long i = first;
1002 #ifdef NTL_HAVE_FMA
1003 // processing three rows at a time is faster
1295 #if (defined(NTL_HAVE_FMA) || defined(NTL_HAVE_AVX512F))
1296 // process three rows at a time
10041297 for (; i <= last-3; i+=3)
10051298 muladd3_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
10061299 for (; i < last; i++)
10191312 void muladd_all_by_16(long first, long last, double *x, const double *a, const double *b, long n)
10201313 {
10211314 long i = first;
1022 #ifdef NTL_HAVE_FMA
1315 #if (defined(NTL_HAVE_FMA) || defined(NTL_HAVE_AVX512F))
10231316 // processing three rows at a time is faster
10241317 for (; i <= last-3; i+=3)
10251318 muladd3_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
34163709 // multiply row k by pivot_inv
34173710 long t1 = pivot_inv;
34183711 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
3419 long * NTL_RESTRICT y = &M[k][0];
3712 long *y = &M[k][0];
34203713 for (long j = 0; j < n; j++)
34213714 y[j] = MulModPrecon(y[j], t1, p, t1pinv);
34223715
34293722 NTL_IMPORT(p)
34303723 NTL_IMPORT(n)
34313724 NTL_IMPORT(k)
3432 long * NTL_RESTRICT y = &M[k][0];
3725 long *y = &M[k][0];
34333726 for (long i = first; i < last; i++) {
34343727 if (i == k) continue; // skip row k
34353728
3436 long * NTL_RESTRICT x = &M[i][0];
3729 long *x = &M[i][0];
34373730 long t1 = x[k];
34383731 t1 = NegateMod(t1, p);
34393732 x[k] = 0;
34593752       // pivot columns, using reverse swap sequence
34603753
34613754 for (long i = 0; i < n; i++) {
3462 long * NTL_RESTRICT x = &M[i][0];
3755 long *x = &M[i][0];
34633756
34643757 for (long k = n-1; k >= 0; k--) {
34653758 long pos = P[k];
35663859 // multiply row k by pivot_inv
35673860 long t1 = pivot_inv;
35683861 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
3569 unsigned long * NTL_RESTRICT y = &M[k][0];
3862 unsigned long *y = &M[k][0];
35703863 for (long j = 0; j < n; j++) {
35713864 long t2 = rem(y[j], p, red_struct);
35723865 y[j] = MulModPrecon(t2, t1, p, t1pinv);
35813874 NTL_IMPORT(n)
35823875 NTL_IMPORT(k)
35833876 NTL_IMPORT(red_struct)
3584 unsigned long * NTL_RESTRICT y = &M[k][0];
3877 unsigned long *y = &M[k][0];
35853878 if (cleanup) {
35863879 for (long i = first; i < last; i++) {
35873880 if (i == k) continue;
35883881 // skip row k: the data won't change, but it
35893882             // technically is a race condition in a multi-threaded
3590 // execution, and it would violate the "restrict"
3591 // contract
3592
3593 unsigned long * NTL_RESTRICT x = &M[i][0];
3883 // execution
3884
3885 unsigned long *x = &M[i][0];
35943886 for (long j = 0; j < n; j++) {
35953887 x[j] = rem(x[j], p, red_struct);
35963888 }
36013893 for (long i = first; i < last; i++) {
36023894 if (i == k) continue; // skip row k
36033895
3604 unsigned long * NTL_RESTRICT x = &M[i][0];
3896 unsigned long *x = &M[i][0];
36053897 long t1 = rem(x[k], p, red_struct);
36063898 t1 = NegateMod(t1, p);
36073899 x[k] = 0;
36363928       // pivot columns, using reverse swap sequence
36373929
36383930 for (long i = 0; i < n; i++) {
3639 unsigned long * NTL_RESTRICT x = &M[i][0];
3931 unsigned long *x = &M[i][0];
36403932
36413933 for (long k = n-1; k >= 0; k--) {
36423934 long pos = P[k];
37444036 // multiply row k by pivot_inv
37454037 long t1 = pivot_inv;
37464038 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
3747 double * NTL_RESTRICT y = &M[k][0];
4039 double *y = &M[k][0];
37484040 for (long j = 0; j < n; j++) {
37494041 long t2 = rem((unsigned long)(long)y[j], p, red_struct);
37504042 y[j] = MulModPrecon(t2, t1, p, t1pinv);
37594051 NTL_IMPORT(n)
37604052 NTL_IMPORT(k)
37614053 NTL_IMPORT(red_struct)
3762 double * NTL_RESTRICT y = &M[k][0];
4054 double *y = &M[k][0];
37634055 if (cleanup) {
37644056 for (long i = first; i < last; i++) {
37654057 if (i == k) continue;
37664058 // skip row k: the data won't change, but it
37674059             // technically is a race condition in a multi-threaded
3768 // execution, and it would violate the "restrict"
3769 // contract
3770
3771 double * NTL_RESTRICT x = &M[i][0];
4060 // execution
4061
4062 double *x = &M[i][0];
37724063 for (long j = 0; j < n; j++) {
37734064 x[j] = rem((unsigned long)(long)x[j], p, red_struct);
37744065 }
37794070 for (long i = first; i < last; i++) {
37804071 if (i == k) continue; // skip row k
37814072
3782 double * NTL_RESTRICT x = &M[i][0];
4073 double *x = &M[i][0];
37834074 long t1 = rem((unsigned long)(long)x[k], p, red_struct);
37844075 t1 = NegateMod(t1, p);
37854076 x[k] = 0;
38024093       // pivot columns, using reverse swap sequence
38034094
38044095 for (long i = 0; i < n; i++) {
3805 double * NTL_RESTRICT x = &M[i][0];
4096 double *x = &M[i][0];
38064097
38074098 for (long k = n-1; k >= 0; k--) {
38084099 long pos = P[k];
39014192 }
39024193
39034194 red_count = red_count-MAT_BLK_SZ;
3904 double * NTL_RESTRICT kpanelp = &M[kpanel][0];
4195 double *kpanelp = &M[kpanel][0];
39054196
39064197 if (cleanup) {
39074198 for (long r = 0; r < n*MAT_BLK_SZ; r++)
39294220 return;
39304221 }
39314222
3932 double * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
4223 double *y = &kpanelp[k*MAT_BLK_SZ];
39334224 if (k != pos) {
39344225 // swap rows pos and k
3935 double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
4226 double *x = &kpanelp[pos*MAT_BLK_SZ];
39364227 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
39374228
39384229 det = NegateMod(det, p);
39574248 for (long i = 0; i < n; i++) {
39584249 if (i == k) continue; // skip row k
39594250
3960 double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
4251 double *x = &kpanelp[i*MAT_BLK_SZ];
39614252 long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
39624253 t1 = NegateMod(t1, p);
39634254 x[k-kk] = 0;
39994290 for (long jpanel = first; jpanel < last; jpanel++) {
40004291 if (jpanel == kpanel) continue;
40014292
4002 double * NTL_RESTRICT jpanelp = &M[jpanel][0];
4293 double *jpanelp = &M[jpanel][0];
40034294
40044295 if (cleanup) {
40054296 for (long r = 0; r < n*MAT_BLK_SZ; r++)
40114302 long pos = P[k];
40124303 if (pos != k) {
40134304 // swap rows pos and k
4014 double * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
4015 double * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
4305 double *pos_p = &jpanelp[pos*MAT_BLK_SZ];
4306 double *k_p = &jpanelp[k*MAT_BLK_SZ];
40164307 for (long j = 0; j < MAT_BLK_SZ; j++)
40174308 _ntl_swap(pos_p[j], k_p[j]);
40184309 }
40454336 if (pos != k) {
40464337 // swap columns pos and k
40474338
4048 double * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4049 double * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
4339 double *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4340 double *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
40504341 for (long i = 0; i < n; i++) {
40514342 _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
40524343 }
41544445 }
41554446
41564447 red_count = red_count-MAT_BLK_SZ;
4157 unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
4448 unsigned long *kpanelp = &M[kpanel][0];
41584449
41594450 if (cleanup) {
41604451 for (long r = 0; r < n*MAT_BLK_SZ; r++)
41824473 return;
41834474 }
41844475
4185 unsigned long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
4476 unsigned long *y = &kpanelp[k*MAT_BLK_SZ];
41864477 if (k != pos) {
41874478 // swap rows pos and k
4188 unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
4479 unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
41894480 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
41904481
41914482 det = NegateMod(det, p);
42104501 for (long i = 0; i < n; i++) {
42114502 if (i == k) continue; // skip row k
42124503
4213 unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
4504 unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
42144505 long t1 = rem(x[k-kk], p, red_struct);
42154506 t1 = NegateMod(t1, p);
42164507 x[k-kk] = 0;
42524543 for (long jpanel = first; jpanel < last; jpanel++) {
42534544 if (jpanel == kpanel) continue;
42544545
4255 unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
4546 unsigned long *jpanelp = &M[jpanel][0];
42564547
42574548 if (cleanup) {
42584549 for (long r = 0; r < n*MAT_BLK_SZ; r++)
42644555 long pos = P[k];
42654556 if (pos != k) {
42664557 // swap rows pos and k
4267 unsigned long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
4268 unsigned long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
4558 unsigned long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
4559 unsigned long *k_p = &jpanelp[k*MAT_BLK_SZ];
42694560 for (long j = 0; j < MAT_BLK_SZ; j++)
42704561 _ntl_swap(pos_p[j], k_p[j]);
42714562 }
43014592 if (pos != k) {
43024593 // swap columns pos and k
43034594
4304 unsigned long * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4305 unsigned long * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
4595 unsigned long *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4596 unsigned long *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
43064597 for (long i = 0; i < n; i++) {
43074598 _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
43084599 }
43974688 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
43984689 long k_max = min(kk+MAT_BLK_SZ, n);
43994690
4400 long * NTL_RESTRICT kpanelp = &M[kpanel][0];
4691 long *kpanelp = &M[kpanel][0];
44014692
44024693
44034694 for (long k = kk; k < k_max; k++) {
44214712 return;
44224713 }
44234714
4424 long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
4715 long *y = &kpanelp[k*MAT_BLK_SZ];
44254716 if (k != pos) {
44264717 // swap rows pos and k
4427 long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
4718 long *x = &kpanelp[pos*MAT_BLK_SZ];
44284719 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
44294720
44304721 det = NegateMod(det, p);
44484739 for (long i = 0; i < n; i++) {
44494740 if (i == k) continue; // skip row k
44504741
4451 long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
4742 long *x = &kpanelp[i*MAT_BLK_SZ];
44524743 long t1 = x[k-kk];
44534744 t1 = NegateMod(t1, p);
44544745 x[k-kk] = 0;
44874778 for (long jpanel = first; jpanel < last; jpanel++) {
44884779 if (jpanel == kpanel) continue;
44894780
4490 long * NTL_RESTRICT jpanelp = &M[jpanel][0];
4781 long *jpanelp = &M[jpanel][0];
44914782
44924783 // perform swaps
44934784 for (long k = kk; k < k_max; k++) {
44944785 long pos = P[k];
44954786 if (pos != k) {
44964787 // swap rows pos and k
4497 long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
4498 long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
4788 long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
4789 long *k_p = &jpanelp[k*MAT_BLK_SZ];
44994790 for (long j = 0; j < MAT_BLK_SZ; j++)
45004791 _ntl_swap(pos_p[j], k_p[j]);
45014792 }
45324823 if (pos != k) {
45334824 // swap columns pos and k
45344825
4535 long * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4536 long * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
4826 long *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4827 long *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
45374828 for (long i = 0; i < n; i++) {
45384829 _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
45394830 }
47435034 // multiply row k by pivot_inv
47445035 long t1 = pivot_inv;
47455036 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
4746 long * NTL_RESTRICT y = &M[k][0];
5037 long *y = &M[k][0];
47475038 // adjust
47485039 for (long j = k+1; j < n; j++)
47495040 y[j] = MulModPrecon(y[j], t1, p, t1pinv);
47625053 NTL_IMPORT(p)
47635054 NTL_IMPORT(n)
47645055 NTL_IMPORT(k)
4765 long * NTL_RESTRICT y = &M[k][0];
5056 long *y = &M[k][0];
47665057
47675058 // adjust
47685059 for (long ii = first; ii < last; ii++) {
47695060 long i = ii + k+1;
47705061
4771 long * NTL_RESTRICT x = &M[i][0];
5062 long *x = &M[i][0];
47725063 long t1 = x[k];
47735064 t1 = NegateMod(t1, p);
47745065 // adjust // x[k] = 0;
49285219 // multiply row k by pivot_inv
49295220 long t1 = pivot_inv;
49305221 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
4931 unsigned long * NTL_RESTRICT y = &M[k][0];
5222 unsigned long *y = &M[k][0];
49325223 for (long j = k+1; j < n; j++) {
49335224 long t2 = rem(y[j], p, red_struct);
49345225 y[j] = MulModPrecon(t2, t1, p, t1pinv);
49455236 NTL_IMPORT(n)
49465237 NTL_IMPORT(k)
49475238 NTL_IMPORT(red_struct)
4948 unsigned long * NTL_RESTRICT y = &M[k][0];
5239 unsigned long *y = &M[k][0];
49495240 if (cleanup) {
49505241 for (long ii = first; ii < last; ii++) {
49515242 long i = ii + k+1;
49525243
4953 unsigned long * NTL_RESTRICT x = &M[i][0];
5244 unsigned long *x = &M[i][0];
49545245 for (long j = k+1; j < n; j++) {
49555246 x[j] = rem(x[j], p, red_struct);
49565247 }
49615252 for (long ii = first; ii < last; ii++) {
49625253 long i = ii + k+1;
49635254
4964 unsigned long * NTL_RESTRICT x = &M[i][0];
5255 unsigned long *x = &M[i][0];
49655256 long t1 = rem(x[k], p, red_struct);
49665257 t1 = NegateMod(t1, p);
49675258 if (t1 == 0) continue;
51245415 // multiply row k by pivot_inv
51255416 long t1 = pivot_inv;
51265417 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
5127 double * NTL_RESTRICT y = &M[k][0];
5418 double *y = &M[k][0];
51285419 for (long j = k+1; j < n; j++) {
51295420 long t2 = rem((unsigned long)(long)y[j], p, red_struct);
51305421 y[j] = MulModPrecon(t2, t1, p, t1pinv);
51415432 NTL_IMPORT(n)
51425433 NTL_IMPORT(k)
51435434 NTL_IMPORT(red_struct)
5144 double * NTL_RESTRICT y = &M[k][0];
5435 double *y = &M[k][0];
51455436 if (cleanup) {
51465437 for (long ii = first; ii < last; ii++) {
51475438 long i = ii + k+1;
51485439
5149 double * NTL_RESTRICT x = &M[i][0];
5440 double *x = &M[i][0];
51505441 for (long j = k+1; j < n; j++) {
51515442 x[j] = rem((unsigned long)(long)x[j], p, red_struct);
51525443 }
51605451 for (long ii = first; ii < last; ii++) {
51615452 long i = ii + k+1;
51625453
5163 double * NTL_RESTRICT x = &M[i][0];
5454 double *x = &M[i][0];
51645455 long t1 = rem((unsigned long)(long)x[k], p, red_struct);
51655456 t1 = NegateMod(t1, p);
51665457 if (t1 == 0) continue;
53035594 }
53045595
53055596 red_count = red_count-MAT_BLK_SZ;
5306 double * NTL_RESTRICT kpanelp = &M[kpanel][0];
5597 double *kpanelp = &M[kpanel][0];
53075598
53085599 if (cleanup) {
53095600 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
53315622 return;
53325623 }
53335624
5334 double * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
5625 double *y = &kpanelp[k*MAT_BLK_SZ];
53355626 if (k != pos) {
53365627 // swap rows pos and k
5337 double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
5628 double *x = &kpanelp[pos*MAT_BLK_SZ];
53385629 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
53395630
53405631 det = NegateMod(det, p);
53635654 for (long i = kk; i < n; i++) {
53645655 if (i == k) continue; // skip row k
53655656
5366 double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
5657 double *x = &kpanelp[i*MAT_BLK_SZ];
53675658 long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
53685659 t1 = NegateMod(t1, p);
53695660 x[k-kk] = 0;
54125703 for (long index = first; index < last; index++) {
54135704 long jpanel = index + kpanel+1;
54145705
5415 double * NTL_RESTRICT jpanelp = &M[jpanel][0];
5706 double *jpanelp = &M[jpanel][0];
54165707
54175708 if (cleanup) {
54185709 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
54245715 long pos = P[k];
54255716 if (pos != k) {
54265717 // swap rows pos and k
5427 double * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
5428 double * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
5718 double *pos_p = &jpanelp[pos*MAT_BLK_SZ];
5719 double *k_p = &jpanelp[k*MAT_BLK_SZ];
54295720 for (long j = 0; j < MAT_BLK_SZ; j++)
54305721 _ntl_swap(pos_p[j], k_p[j]);
54315722 }
55745865 }
55755866
55765867 red_count = red_count-MAT_BLK_SZ;
5577 unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
5868 unsigned long *kpanelp = &M[kpanel][0];
55785869
55795870 if (cleanup) {
55805871 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
56025893 return;
56035894 }
56045895
5605 unsigned long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
5896 unsigned long *y = &kpanelp[k*MAT_BLK_SZ];
56065897 if (k != pos) {
56075898 // swap rows pos and k
5608 unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
5899 unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
56095900 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
56105901
56115902 det = NegateMod(det, p);
56345925 for (long i = kk; i < n; i++) {
56355926 if (i == k) continue; // skip row k
56365927
5637 unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
5928 unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
56385929 long t1 = rem(x[k-kk], p, red_struct);
56395930 t1 = NegateMod(t1, p);
56405931 x[k-kk] = 0;
56825973 for (long index = first; index < last; index++) {
56835974 long jpanel = index + kpanel+1;
56845975
5685 unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
5976 unsigned long *jpanelp = &M[jpanel][0];
56865977
56875978 if (cleanup) {
56885979 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
56945985 long pos = P[k];
56955986 if (pos != k) {
56965987 // swap rows pos and k
5697 unsigned long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
5698 unsigned long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
5988 unsigned long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
5989 unsigned long *k_p = &jpanelp[k*MAT_BLK_SZ];
56995990 for (long j = 0; j < MAT_BLK_SZ; j++)
57005991 _ntl_swap(pos_p[j], k_p[j]);
57015992 }
58296120 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
58306121 long k_max = min(kk+MAT_BLK_SZ, n);
58316122
5832 long * NTL_RESTRICT kpanelp = &M[kpanel][0];
6123 long *kpanelp = &M[kpanel][0];
58336124
58346125 for (long k = kk; k < k_max; k++) {
58356126
58526143 return;
58536144 }
58546145
5855 long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
6146 long *y = &kpanelp[k*MAT_BLK_SZ];
58566147 if (k != pos) {
58576148 // swap rows pos and k
5858 long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
6149 long *x = &kpanelp[pos*MAT_BLK_SZ];
58596150 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
58606151
58616152 det = NegateMod(det, p);
58836174 for (long i = kk; i < n; i++) {
58846175 if (i == k) continue; // skip row k
58856176
5886 long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
6177 long *x = &kpanelp[i*MAT_BLK_SZ];
58876178 long t1 = x[k-kk];
58886179 t1 = NegateMod(t1, p);
58896180 x[k-kk] = 0;
59286219 for (long index = first; index < last; index++) {
59296220 long jpanel = index + kpanel+1;
59306221
5931 long * NTL_RESTRICT jpanelp = &M[jpanel][0];
6222 long *jpanelp = &M[jpanel][0];
59326223
59336224 // perform swaps
59346225 for (long k = kk; k < k_max; k++) {
59356226 long pos = P[k];
59366227 if (pos != k) {
59376228 // swap rows pos and k
5938 long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
5939 long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
6229 long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
6230 long *k_p = &jpanelp[k*MAT_BLK_SZ];
59406231 for (long j = 0; j < MAT_BLK_SZ; j++)
59416232 _ntl_swap(pos_p[j], k_p[j]);
59426233 }
61816472 NTL_IMPORT(n)
61826473 NTL_IMPORT(k)
61836474 NTL_IMPORT(r)
6184 long * NTL_RESTRICT y = &M[r][0];
6475 long *y = &M[r][0];
61856476
61866477 for (long ii = first; ii < last; ii++) {
61876478 long i = ii + r+1;
61886479
6189 long * NTL_RESTRICT x = &M[i][0];
6480 long *x = &M[i][0];
61906481 long t1 = x[k];
61916482 t1 = MulMod(t1, pivot_inv, p);
61926483 t1 = NegateMod(t1, p);
63346625 static inline
63356626 void SwapOneRow(double *panelp, long i, long pos)
63366627 {
6337 double * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
6338 double * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
6628 double *pos_p = &panelp[pos*MAT_BLK_SZ];
6629 double *i_p = &panelp[i*MAT_BLK_SZ];
63396630 for (long j = 0; j < MAT_BLK_SZ; j++)
63406631 _ntl_swap(pos_p[j], i_p[j]);
63416632 }
64166707
64176708 AlignedArray<double> aux_panel_store;
64186709 aux_panel_store.SetLength(n*MAT_BLK_SZ);
6419 double * NTL_RESTRICT aux_panel = &aux_panel_store[0];
6710 double *aux_panel = &aux_panel_store[0];
64206711
64216712
64226713 AlignedArray<double> buf_store1;
64826773 kpanel++;
64836774 }
64846775
6485 double * NTL_RESTRICT kpanelp = &M[kpanel][0];
6776 double *kpanelp = &M[kpanel][0];
64866777
64876778 if (k == kk) { // a fresh kpanel -- special processing
64886779
65306821 continue;
65316822 }
65326823
6533 double * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
6534 double * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
6824 double *y = &kpanelp[r*MAT_BLK_SZ];
6825 double *y1 = &aux_panel[r*MAT_BLK_SZ];
65356826 if (r != pos) {
65366827 // swap rows pos and r
6537 double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
6538 double * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
6828 double *x = &kpanelp[pos*MAT_BLK_SZ];
6829 double *x1 = &aux_panel[pos*MAT_BLK_SZ];
65396830
65406831 for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
65416832 for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
65526843
65536844 // clear column
65546845 for (long i = r+1; i < n; i++) {
6555 double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
6556 double * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
6846 double *x = &kpanelp[i*MAT_BLK_SZ];
6847 double *x1 = &aux_panel[i*MAT_BLK_SZ];
65576848 long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
65586849 t1 = MulMod(t1, pivot_inv, p);
65596850 t1 = NegateMod(t1, p);
66036894 for (long index = first; index < last; index++) {
66046895 long jpanel = index + kpanel+1;
66056896
6606 double * NTL_RESTRICT jpanelp = &M[jpanel][0];
6897 double *jpanelp = &M[jpanel][0];
66076898
66086899 if (cleanup) {
66096900 for (long h = 0; h < n*MAT_BLK_SZ; h++)
68217112 static inline
68227113 void SwapOneRow(unsigned long *panelp, long i, long pos)
68237114 {
6824 unsigned long * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
6825 unsigned long * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
7115 unsigned long *pos_p = &panelp[pos*MAT_BLK_SZ];
7116 unsigned long *i_p = &panelp[i*MAT_BLK_SZ];
68267117 for (long j = 0; j < MAT_BLK_SZ; j++)
68277118 _ntl_swap(pos_p[j], i_p[j]);
68287119 }
69047195
69057196 UniqueArray<unsigned long> aux_panel_store;
69067197 aux_panel_store.SetLength(n*MAT_BLK_SZ);
6907 unsigned long * NTL_RESTRICT aux_panel = &aux_panel_store[0];
7198 unsigned long *aux_panel = &aux_panel_store[0];
69087199
69097200
69107201 UniqueArray<unsigned long> buf_store1;
69757266 kpanel++;
69767267 }
69777268
6978 unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
7269 unsigned long *kpanelp = &M[kpanel][0];
69797270
69807271 if (k == kk) { // a fresh kpanel -- special processing
69817272
70257316 continue;
70267317 }
70277318
7028 unsigned long * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
7029 unsigned long * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
7319 unsigned long *y = &kpanelp[r*MAT_BLK_SZ];
7320 unsigned long *y1 = &aux_panel[r*MAT_BLK_SZ];
70307321 if (r != pos) {
70317322 // swap rows pos and r
7032 unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
7033 unsigned long * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
7323 unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
7324 unsigned long *x1 = &aux_panel[pos*MAT_BLK_SZ];
70347325
70357326 for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
70367327 for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
70477338
70487339 // clear column
70497340 for (long i = r+1; i < n; i++) {
7050 unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
7051 unsigned long * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
7341 unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
7342 unsigned long *x1 = &aux_panel[i*MAT_BLK_SZ];
70527343 long t1 = rem(x[k-kk], p, red_struct);
70537344 t1 = MulMod(t1, pivot_inv, p);
70547345 t1 = NegateMod(t1, p);
70987389 for (long index = first; index < last; index++) {
70997390 long jpanel = index + kpanel+1;
71007391
7101 unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
7392 unsigned long *jpanelp = &M[jpanel][0];
71027393
71037394 if (cleanup) {
71047395 for (long h = 0; h < n*MAT_BLK_SZ; h++)
73237614 static inline
73247615 void SwapOneRow(long *panelp, long i, long pos)
73257616 {
7326 long * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
7327 long * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
7617 long *pos_p = &panelp[pos*MAT_BLK_SZ];
7618 long *i_p = &panelp[i*MAT_BLK_SZ];
73287619 for (long j = 0; j < MAT_BLK_SZ; j++)
73297620 _ntl_swap(pos_p[j], i_p[j]);
73307621 }
74087699
74097700 UniqueArray<long> aux_panel_store;
74107701 aux_panel_store.SetLength(n*MAT_BLK_SZ);
7411 long * NTL_RESTRICT aux_panel = &aux_panel_store[0];
7702 long *aux_panel = &aux_panel_store[0];
74127703
74137704
74147705 UniqueArray<long> buf_store1;
74627753 kpanel++;
74637754 }
74647755
7465 long * NTL_RESTRICT kpanelp = &M[kpanel][0];
7756 long *kpanelp = &M[kpanel][0];
74667757
74677758 if (k == kk) { // a fresh kpanel -- special processing
74687759
75047795 continue;
75057796 }
75067797
7507 long * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
7508 long * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
7798 long *y = &kpanelp[r*MAT_BLK_SZ];
7799 long *y1 = &aux_panel[r*MAT_BLK_SZ];
75097800 if (r != pos) {
75107801 // swap rows pos and r
7511 long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
7512 long * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
7802 long *x = &kpanelp[pos*MAT_BLK_SZ];
7803 long *x1 = &aux_panel[pos*MAT_BLK_SZ];
75137804
75147805 for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
75157806 for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
75207811
75217812 // clear column
75227813 for (long i = r+1; i < n; i++) {
7523 long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
7524 long * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
7814 long *x = &kpanelp[i*MAT_BLK_SZ];
7815 long *x1 = &aux_panel[i*MAT_BLK_SZ];
75257816 long t1 = x[k-kk];
75267817 t1 = MulMod(t1, pivot_inv, p);
75277818 t1 = NegateMod(t1, p);
75687859 for (long index = first; index < last; index++) {
75697860 long jpanel = index + kpanel+1;
75707861
7571 long * NTL_RESTRICT jpanelp = &M[jpanel][0];
7862 long *jpanelp = &M[jpanel][0];
75727863
75737864 // perform swaps
75747865 ApplySwaps(jpanelp, rr, r, P);
138138 lzz_pEX.o lzz_pEXFactoring.o lzz_pX.o lzz_pX1.o lzz_pXCharPoly.o \
139139 lzz_pXFactoring.o mat_GF2.o mat_GF2E.o mat_RR.o mat_ZZ.o mat_ZZ_p.o mat_ZZ_pE.o \
140140 mat_lzz_p.o mat_lzz_pE.o mat_poly_ZZ.o mat_poly_ZZ_p.o mat_poly_lzz_p.o \
141 quad_float.o tools.o vec_GF2.o vec_GF2E.o vec_RR.o vec_ZZ.o vec_ZZ_p.o \
141 quad_float.o quad_float1.o tools.o vec_GF2.o vec_GF2E.o vec_RR.o vec_ZZ.o vec_ZZ_p.o \
142142 vec_ZZ_pE.o vec_lzz_p.o vec_lzz_pE.o xdouble.o G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o \
143 G_LLL_RR.o thread.o BasicThreadPool.o MatPrime.o
143 G_LLL_RR.o thread.o BasicThreadPool.o MatPrime.o pd_FFT.o
144144
145145 # library source files
146146 SRC=FFT.cpp FacVec.cpp GF2.cpp GF2E.cpp GF2EX.cpp GF2EXFactoring.cpp GF2X.cpp \
152152 lzz_pX.cpp lzz_pX1.cpp lzz_pXCharPoly.cpp lzz_pXFactoring.cpp mat_GF2.cpp \
153153 mat_GF2E.cpp mat_RR.cpp mat_ZZ.cpp mat_ZZ_p.cpp mat_ZZ_pE.cpp mat_lzz_p.cpp \
154154 mat_lzz_pE.cpp mat_poly_ZZ.cpp mat_poly_ZZ_p.cpp mat_poly_lzz_p.cpp \
155 quad_float.cpp tools.cpp vec_GF2.cpp vec_GF2E.cpp vec_RR.cpp vec_ZZ.cpp \
155 quad_float.cpp quad_float1.cpp tools.cpp vec_GF2.cpp vec_GF2E.cpp vec_RR.cpp vec_ZZ.cpp \
156156 vec_ZZ_p.cpp vec_ZZ_pE.cpp vec_lzz_p.cpp vec_lzz_pE.cpp xdouble.cpp \
157157 G_LLL_FP.cpp G_LLL_QP.cpp G_LLL_XD.cpp G_LLL_RR.cpp thread.cpp \
158 BasicThreadPool.cpp MatPrime.cpp
158 BasicThreadPool.cpp MatPrime.cpp pd_FFT.cpp
159159
160160
161161
162162 # library header files
163 INCL=FFT.h FacVec.h GF2.h GF2E.h GF2EX.h GF2EXFactoring.h GF2X.h \
163 INCL=FFT.h FFT_impl.h FacVec.h GF2.h GF2E.h GF2EX.h GF2EXFactoring.h GF2X.h \
164164 GF2XFactoring.h GF2XVec.h HNF.h ctools.h LLL.h RR.h WordVector.h \
165165 ZZ.h ZZ_limbs.h sp_arith.h ZZVec.h ZZX.h ZZXFactoring.h ZZ_p.h ZZ_pE.h ZZ_pEX.h \
166166 ZZ_pEXFactoring.h ZZ_pX.h ZZ_pXFactoring.h fileio.h lip.h lzz_p.h lzz_pE.h \
174174 vec_vec_GF2E.h vec_vec_RR.h vec_vec_ZZ.h vec_vec_ZZ_p.h vec_vec_ZZ_pE.h \
175175 vec_vec_long.h vec_vec_lzz_p.h vec_vec_lzz_pE.h vec_xdouble.h xdouble.h \
176176 config.h version.h new.h vec_ulong.h vec_vec_ulong.h SmartPtr.h \
177 Lazy.h LazyTable.h thread.h BasicThreadPool.h MatPrime.h
177 Lazy.h LazyTable.h thread.h BasicThreadPool.h MatPrime.h PD.h pd_FFT.h
178178
179179
180180
186186
187187
188188 # test source files
189 TS=QuickTest.cpp ZZTest.cpp BerlekampTest.cpp CanZassTest.cpp ZZXFacTest.cpp \
190 MoreFacTest.cpp LLLTest.cpp subset.cpp MatrixTest.cpp mat_lzz_pTest.cpp \
191 CharPolyTest.cpp RRTest.cpp QuadTest.cpp GF2XTest.cpp GF2EXTest.cpp \
192 BitMatTest.cpp ZZ_pEXTest.cpp lzz_pEXTest.cpp Timing.cpp ThreadTest.cpp \
193 ExceptionTest.cpp
189 TS=QuickTest.cpp ZZTest.cpp SSMulTest.cpp ZZ_pXTest.cpp lzz_pXTest.cpp BerlekampTest.cpp \
190 CanZassTest.cpp ZZXFacTest.cpp MoreFacTest.cpp \
191 LLLTest.cpp subset.cpp MatrixTest.cpp mat_lzz_pTest.cpp \
192 CharPolyTest.cpp RRTest.cpp QuadTest.cpp GF2XTest.cpp GF2EXTest.cpp GF2EXGCDTest.cpp \
193 BitMatTest.cpp ZZ_pEXTest.cpp ZZ_pEXGCDTest.cpp lzz_pEXTest.cpp lzz_pEXGCDTest.cpp \
194 Timing.cpp ThreadTest.cpp ExceptionTest.cpp
195
196 # aux source to help compute crossovers
197 CROSS=GF2EXDivCross.cpp GF2EXGCDCross.cpp GF2EXKarCross.cpp GF2EXModCross.cpp
198
194199
195200 # scripts
196201 SCRIPTS=MakeGetTime MakeGetPID MakeCheckFeatures ResetFeatures CopyFeatures \
207212 AUXPROGS = TestGetTime TestGetPID CheckFeatures CheckCompile GenConfigInfo CheckContract \
208213 CheckThreads
209214
210 FEATURES=ALIGNED_ARRAY BUILTIN_CLZL LL_TYPE SSSE3 AVX PCLMUL AVX2 FMA \
215 FEATURES=ALIGNED_ARRAY BUILTIN_CLZL LL_TYPE SSSE3 AVX PCLMUL AVX2 FMA AVX512F \
211216 COPY_TRAITS1 COPY_TRAITS2 CHRONO_TIME MACOS_TIME POSIX_TIME
212217
213218
262267
263268
264269
265 DOC = $(DFILES) $(HTFILES)
270 DOC = $(DFILES) $(HTFILES) TFT-time.jpg zmulrat.jpg flintrat.jpg
266271
267272
268273 # test program executables
269 PROGS=QuickTest ZZTest BerlekampTest CanZassTest ZZXFacTest MoreFacTest LLLTest \
274 PROGS=QuickTest ZZTest SSMulTest ZZ_pXTest lzz_pXTest BerlekampTest CanZassTest \
275 ZZXFacTest MoreFacTest LLLTest \
270276 BitMatTest MatrixTest mat_lzz_pTest CharPolyTest RRTest QuadTest GF2XTest \
271 GF2EXTest subset ZZ_pEXTest lzz_pEXTest Timing ThreadTest
277 GF2EXTest GF2EXGCDTest subset ZZ_pEXTest ZZ_pEXGCDTest lzz_pEXTest lzz_pEXGCDTest \
278 Timing ThreadTest
272279
273280 # things to save to a tar file
274 SFILES=makefile $(SRC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) mach_desc.win \
281 SFILES=$(SRC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) $(CROSS) mach_desc.win \
275282 Poly1TimeTest.cpp Poly2TimeTest.cpp Poly3TimeTest.cpp GF2XTimeTest.cpp \
276283 InitSettings.cpp DispSettings.cpp WizardAux Wizard
277284
368375
369376 quad_float.o: quad_float.cpp
370377 $(LCOMP) $(COMPILE) $(NOCONTRACT) quad_float.cpp
378
379 pd_FFT.o: pd_FFT.cpp
380 $(LCOMP) $(COMPILE) $(NOCONTRACT) pd_FFT.cpp
371381
372382 CheckCompile: CheckCompile.cpp
373383 $(LINK) -o CheckCompile CheckCompile.cpp $(LDLIBS)
524534 rm -f `cat WINDIR`.zip
525535 mv dos `cat WINDIR`
526536 chmod -R a+rX `cat WINDIR`
527 find ./`cat WINDIR` '!' -name '*.gif' -print | zip -l `cat WINDIR` -@
528 find ./`cat WINDIR` -name '*.gif' -print | zip -u `cat WINDIR` -@
537 find ./`cat WINDIR` '!' '(' -name '*.gif' -o -name '*.jpg' ')' -print | zip -l `cat WINDIR` -@
538 find ./`cat WINDIR` -name '*.gif' -o -name '*.jpg' -print | zip -u `cat WINDIR` -@
529539 rm -rf `cat WINDIR`
530540
531541
0
1 // The configure script should define NTL_FP_CONTRACT_OFF
2 // for icc via the NOCONTRACT variable
3 #ifdef NTL_FP_CONTRACT_OFF
4 #pragma fp_contract(off)
5 #endif
6
7
8 #include <NTL/tools.h>
9
10 #ifdef NTL_ENABLE_AVX_FFT
11
12 // The configure script tries to prevent this, but we
13 // double-check here. Note that while it is strongly
14 // discouraged, other parts of NTL probably work even with
15 // "fast math"; however, quad_float will definitely break.
16
17 #if (defined(__GNUC__) && __FAST_MATH__)
18 #error "do not compile pd_FFT.cpp with -ffast-math!!"
19 #endif
20
21
22
23 #include <NTL/PD.h>
24 #include <NTL/pd_FFT.h>
25 #include <NTL/FFT_impl.h>
26
27 #if (defined(__GNUC__) && __FAST_MATH__)
28 #error "do not compile pd_FFT.cpp with -ffast-math!!"
29 #endif
30
31 #if (NTL_FMA_DETECTED && !defined(NTL_CONTRACTION_FIXED))
32 #error "contraction not fixed"
33 #endif
34
35
36 NTL_START_IMPL
37
38 #define NTL_CSR_NEAREST (0x00000000)
39 #define NTL_CSR_DOWN (0x00002000)
40 #define NTL_CSR_UP (0x00004000)
41 #define NTL_CSR_TRUNC (0x00006000)
42 #define NTL_CSR_MASK (0x00006000)
43
44 CSRPush::CSRPush()
45 {
46 // save current register value
47 reg = _mm_getcsr();
48 // set rounding mode to "down"
49 _mm_setcsr((reg & ~NTL_CSR_MASK) | NTL_CSR_DOWN);
50 }
51
52 CSRPush::~CSRPush()
53 {
54 _mm_setcsr(reg);
55 }
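// Illustrative usage, added for reference (assumed, not taken from the diff):
// the guard is meant to be created on the stack around the AVX FFT kernels, so
// the MXCSR rounding mode is "down" while they run and is restored on scope exit:
//
//    {
//       CSRPush guard;   // rounding mode switched to "down"
//       pd_fft_trunc_impl(A, a, xp, lgN, mod, yn, xn);
//    }                   // destructor restores the saved MXCSR here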
56
57
58
59 void
60 pd_LazyPrepMulModPrecon_impl(double *bninv, const double *b, double n, long len)
61 {
62 for (long i = 0; i < len; i++) bninv[i] = b[i]/n;
63 }
64
65
66
67 template<class pd> pd
68 pd_LazyReduce1(pd a, double q)
69 {
70 return correct_excess(a, q);
71 }
72
73 template<class pd> pd
74 pd_LazyReduce2(pd a, double q)
75 {
76 return correct_excess(a, 2*q);
77 }
78
79 // inputs in [0, 2*n), output in [0, 4*n)
80 template<class pd> pd
81 pd_LazyAddMod(pd a, pd b, double n)
82 {
83 return a+b;
84 }
85
86 // inputs in [0, 2*n), output in [0, 4*n)
87 template<class pd> pd
88 pd_LazySubMod(pd a, pd b, double n)
89 {
90 return a-b+2*n;
91 }
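// Why the stated bounds for pd_LazyAddMod/pd_LazySubMod hold (added for
// reference): if a and b lie in [0, 2*n), then a + b lies in [0, 4*n), and
// a - b lies in (-2*n, 2*n), so a - b + 2*n lies in (0, 4*n) and is congruent
// to a - b mod n.  E.g. with n = 5, a = 9, b = 1: pd_LazySubMod returns
// 9 - 1 + 10 = 18, which is in [0, 20) and 18 mod 5 = 3 = (9 - 1) mod 5.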
92
93 // inputs in [0, 2*n), output in [0, 2*n)
94 template<class pd> pd
95 pd_LazyAddMod2(pd a, pd b, double n)
96 {
97 pd r = a+b;
98 return correct_excess(r, 2*n);
99 }
100
101 // inputs in [0, 2*n), output in [0, 2*n)
102 template<class pd> pd
103 pd_LazySubMod2(pd a, pd b, double n)
104 {
105 pd r = a-b;
106 return correct_deficit(r, 2*n);
107 }
108
109 // inputs in [0, 4*n), output in [0, 4*n)
110 template<class pd> pd
111 pd_LazyAddMod4(pd a, pd b, double n)
112 {
113 pd r = a+b;
114 return correct_excess(r, 4*n);
115 }
116
117 // inputs in [0, 4*n), output in [0, 4*n)
118 template<class pd> pd
119 pd_LazySubMod4(pd a, pd b, double n)
120 {
121 pd r = a-b;
122 return correct_deficit(r, 4*n);
123 }
124
125
126 // Input and output in [0, 4*n)
127 template<class pd> pd
128 pd_LazyDoubleMod4(pd a, double n)
129 {
130 return 2 * pd_LazyReduce2(a, n);
131 }
132
133 // Input and output in [0, 2*n)
134 template<class pd> pd
135 pd_LazyDoubleMod2(pd a, double n)
136 {
137 return 2 * pd_LazyReduce1(a, n);
138 }
139
140
141
142 // n in [0,2^50), b in [0,n), a in [0,4*n), bninv = RoundDown(b/n)
143 // returns a*b mod n in [0, 2*n)
144 template<class pd> pd
145 pd_LazyMulModPrecon(pd a, pd b, double n, pd bninv)
146 {
147 pd hi = a*b;
148 pd lo = fused_mulsub(a, b, hi); // hi+lo == a*b (exactly)
149 pd q = fused_muladd(a, bninv, 1L << 52);
150 q -= (1L << 52); // q is the correct quotient, or one too small
151 pd d = fused_negmuladd(q, n, hi); // d == hi - q*n (exactly)
152 pd r = d + lo; // r is the remainder, or the remainder plus n
153
154 return r;
155 }
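// Scalar sketch of the same computation, added for illustration only (this
// helper is not part of NTL's API).  It mirrors the vector code above and,
// like it, assumes the rounding mode has been set to "down" (see CSRPush)
// so that the 2^52 shift trick yields floor(a*bninv).
#include <cmath>
static double scalar_LazyMulModPrecon(double a, double b, double n, double bninv)
{
   const double shift = 4503599627370496.0;        // 2^52
   double hi = a * b;
   double lo = std::fma(a, b, -hi);                // hi + lo == a*b (exactly)
   double q  = std::fma(a, bninv, shift) - shift;  // quotient, or one too small
   double d  = std::fma(-q, n, hi);                // d == hi - q*n (exactly)
   return d + lo;                                  // a*b mod n, or that plus n
}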
156
157 // return (a[0] + a[1], a[0] - a[1], a[2] + a[3], a[2] - a[3], ...)
158 // all inputs and outputs in [0, 2*n)
159 template<class pd> pd
160 pd_fwd_butterfly_packed2(pd a, double n)
161 {
162 pd b = swap2(a);
163 pd sum = pd_LazyAddMod(a, b, n);
164 pd diff = pd_LazySubMod(b, a, n);
165 pd res = blend2(sum, diff);
166 res = pd_LazyReduce2(res, n);
167 return res;
168 }
169
170 // return (a[0] + a[2], a[1] + a[3], (a[0] - a[2]), (a[1] - a[3])*root, ...)
171 // all inputs and outputs in [0, 2*n)
172 // it is also assumed that w = (1,1,1,root,...) and wninv = RoundDown(w/n)
173 template<class pd> pd
174 pd_fwd_butterfly_packed4(pd a, pd w, double n, pd wninv)
175 {
176 pd b = swap4(a);
177 pd sum = pd_LazyAddMod(a, b, n);
178 pd diff = pd_LazySubMod(b, a, n);
179 pd res = blend4(sum, diff);
180 res = pd_LazyMulModPrecon(res, w, n, wninv);
181 return res;
182 }
183
184
185 static double
186 pd_LazyPrepMulModPrecon(long b, long n)
187 {
188 return double(b)/double(n);
189 }
190
191
192 //===================================
193
194
195 #define NTL_PD_FFT_THRESH (11)
196
197 #define PDLGSZ NTL_LG2_PDSZ
198 #define PDSZ NTL_PDSZ
199
200 #if (PDSZ == 8)
201 typedef PD<8> pd_full;
202 typedef PD<4> pd_half;
203 typedef PD<2> pd_qrtr;
204 #else
205 typedef PD<4> pd_full;
206 typedef PD<2> pd_half;
207 #endif
208
209 #define PDLD pd_full::load
210
211
212 // this assumes xx0, xx1, w, wqinv are pd_half's
213 #define fwd_butterfly_half(xx0, xx1, w, q, wqinv) \
214 do \
215 { \
216 pd_half x0_ = xx0; \
217 pd_half x1_ = xx1; \
218 pd_half t_ = pd_LazySubMod(x0_, x1_, q); \
219 xx0 = pd_LazyAddMod2(x0_, x1_, q); \
220 xx1 = pd_LazyMulModPrecon(t_, w, q, wqinv); \
221 } \
222 while (0)
223
224 // this assumes xx0, xx1, w, wqinv are pd_full's
225 #define fwd_butterfly_full(xx0, xx1, w, q, wqinv) \
226 do \
227 { \
228 pd_full x0_ = xx0; \
229 pd_full x1_ = xx1; \
230 pd_full t_ = pd_LazySubMod(x0_, x1_, q); \
231 xx0 = pd_LazyAddMod2(x0_, x1_, q); \
232 xx1 = pd_LazyMulModPrecon(t_, w, q, wqinv); \
233 } \
234 while (0)
235
236 // this assumes xx0_ptr, xx1_ptr, w_ptr, wqinv_ptr are double pointers
237 // which are read/written as pd_full's.
238 // In gcc, the restrict keyword will help code generation.
239 #define fwd_butterfly(xx0_ptr, xx1_ptr, w_ptr, q, wqinv_ptr) \
240 do \
241 { \
242 pd_full x0_ = PDLD(xx0_ptr); \
243 pd_full x1_ = PDLD(xx1_ptr); \
244 pd_full w_ = PDLD(w_ptr); \
245 pd_full wqinv_ = PDLD(wqinv_ptr); \
246 pd_full t_ = pd_LazySubMod(x0_, x1_, q); \
247 store(xx0_ptr, pd_LazyAddMod2(x0_, x1_, q)); \
248 store(xx1_ptr, pd_LazyMulModPrecon(t_, w_, q, wqinv_)); \
249 } \
250 while (0)
251
252
253 #if 0
254 #define fwd_butterfly_x4(xx0_ptr, xx1_ptr, w_ptr, q, wqinv_ptr) \
255 do \
256 { \
257 pd_full xx0_0_ = PDLD(xx0_ptr+0*PDSZ); pd_full xx1_0_ = PDLD(xx1_ptr+0*PDSZ); \
258 pd_full xx0_1_ = PDLD(xx0_ptr+1*PDSZ); pd_full xx1_1_ = PDLD(xx1_ptr+1*PDSZ); \
259 pd_full xx0_2_ = PDLD(xx0_ptr+2*PDSZ); pd_full xx1_2_ = PDLD(xx1_ptr+2*PDSZ); \
260 pd_full xx0_3_ = PDLD(xx0_ptr+3*PDSZ); pd_full xx1_3_ = PDLD(xx1_ptr+3*PDSZ); \
261 fwd_butterfly_full(xx0_0_, xx1_0_, PDLD(w_ptr+0*PDSZ), q, PDLD(wqinv_ptr+0*PDSZ)); \
262 fwd_butterfly_full(xx0_1_, xx1_1_, PDLD(w_ptr+1*PDSZ), q, PDLD(wqinv_ptr+1*PDSZ)); \
263 fwd_butterfly_full(xx0_2_, xx1_2_, PDLD(w_ptr+2*PDSZ), q, PDLD(wqinv_ptr+2*PDSZ)); \
264 fwd_butterfly_full(xx0_3_, xx1_3_, PDLD(w_ptr+3*PDSZ), q, PDLD(wqinv_ptr+3*PDSZ)); \
265 store(xx0_ptr+0*PDSZ, xx0_0_); store(xx1_ptr+0*PDSZ, xx1_0_); \
266 store(xx0_ptr+1*PDSZ, xx0_1_); store(xx1_ptr+1*PDSZ, xx1_1_); \
267 store(xx0_ptr+2*PDSZ, xx0_2_); store(xx1_ptr+2*PDSZ, xx1_2_); \
268 store(xx0_ptr+3*PDSZ, xx0_3_); store(xx1_ptr+3*PDSZ, xx1_3_); \
269 } \
270 while(0)
271 #else
272 #define fwd_butterfly_x4(xx0_ptr, xx1_ptr, w_ptr, q, wqinv_ptr) \
273 do \
274 { \
275 fwd_butterfly(xx0_ptr+0*PDSZ, xx1_ptr+0*PDSZ, w_ptr+0*PDSZ, q, wqinv_ptr+0*PDSZ); \
276 fwd_butterfly(xx0_ptr+1*PDSZ, xx1_ptr+1*PDSZ, w_ptr+1*PDSZ, q, wqinv_ptr+1*PDSZ); \
277 fwd_butterfly(xx0_ptr+2*PDSZ, xx1_ptr+2*PDSZ, w_ptr+2*PDSZ, q, wqinv_ptr+2*PDSZ); \
278 fwd_butterfly(xx0_ptr+3*PDSZ, xx1_ptr+3*PDSZ, w_ptr+3*PDSZ, q, wqinv_ptr+3*PDSZ); \
279 } \
280 while(0)
281 #endif
282
283
284
285
286 static inline NTL_ALWAYS_INLINE void
287 pd_fft_layer_inner_loop(double* NTL_RESTRICT xp0,
288 double* NTL_RESTRICT xp1,
289 long size,
290 const double* NTL_RESTRICT wtab,
291 const double* NTL_RESTRICT wqinvtab,
292 double q)
293
294 {
295 long j = 0;
296 do {
297 fwd_butterfly_x4(xp0+j, xp1+j, wtab+j, q, wqinvtab+j);
298 j += 4*PDSZ;
299 } while (j < size);
300 }
301
302 // assumes size >= 8*PDSZ
303 static inline NTL_ALWAYS_INLINE void
304 pd_fft_layer(double* xp, long blocks, long size,
305 const double* wtab,
306 const double* wqinvtab,
307 double q)
308 {
309 size /= 2;
310
311 do {
312 pd_fft_layer_inner_loop(xp, xp+size, size, wtab, wqinvtab, q);
313 xp += 2 * size;
314 } while (--blocks != 0);
315 }
316
317
318 // size == 8*PDSZ
319 static inline NTL_ALWAYS_INLINE void
320 pd_fft_layer_size8(double* NTL_RESTRICT xp, long blocks,
321 const double* NTL_RESTRICT wtab,
322 const double* NTL_RESTRICT wqinvtab,
323 double q)
324 {
325 do {
326 fwd_butterfly_x4(xp+0*PDSZ, xp+4*PDSZ, wtab, q, wqinvtab);
327 xp += 8*PDSZ;
328 } while (--blocks != 0);
329 }
330
331 // size == 4*PDSZ
332 static inline NTL_ALWAYS_INLINE void
333 pd_fft_layer_size4(double* NTL_RESTRICT xp, long blocks,
334 const double* NTL_RESTRICT wtab,
335 const double* NTL_RESTRICT wqinvtab,
336 double q)
337 {
338 do {
339 fwd_butterfly(xp+0*PDSZ, xp+2*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
340 fwd_butterfly(xp+1*PDSZ, xp+3*PDSZ, wtab+1*PDSZ, q, wqinvtab+1*PDSZ);
341
342 fwd_butterfly(xp+4*PDSZ, xp+6*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
343 fwd_butterfly(xp+5*PDSZ, xp+7*PDSZ, wtab+1*PDSZ, q, wqinvtab+1*PDSZ);
344
345 xp += 8*PDSZ;
346 blocks -= 2;
347 } while (blocks != 0);
348 }
349
350 // size == 2*PDSZ
351 static inline NTL_ALWAYS_INLINE void
352 pd_fft_layer_size2(double* NTL_RESTRICT xp, long blocks,
353 const double* NTL_RESTRICT wtab,
354 const double* NTL_RESTRICT wqinvtab,
355 double q)
356 {
357 do {
358 fwd_butterfly(xp+0*PDSZ, xp+1*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
359 fwd_butterfly(xp+2*PDSZ, xp+3*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
360 fwd_butterfly(xp+4*PDSZ, xp+5*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
361 fwd_butterfly(xp+6*PDSZ, xp+7*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
362
363 xp += 8*PDSZ;
364 blocks -= 4;
365 } while (blocks != 0);
366 }
367
368 #if (PDSZ == 8)
369 static inline NTL_ALWAYS_INLINE void
370 pd_fft_layer_size1_one_block(double* x,
371 pd_half w8, pd_half w8qinv,
372 pd_full w4, pd_full w4qinv,
373 double q)
374 {
375 pd_half x0 = pd_half::load(x);
376 pd_half x1 = pd_half::load(x+PDSZ/2);
377 fwd_butterfly_half(x0, x1, w8, q, w8qinv);
378 pd_full y = join(x0, x1);
379
380 y = pd_fwd_butterfly_packed4(y, w4, q, w4qinv);
381 y = pd_fwd_butterfly_packed2(y, q);
382
383 store(x, y);
384 }
385
386 // size == PDSZ == 8
387 // processes last three levels, of size 8, 4, and 2.
388 static inline NTL_ALWAYS_INLINE void
389 pd_fft_layer_size1(double* xp, long blocks,
390 const double **w_pp, const double **wqinv_pp,
391 double q)
392 {
393 const double *w8_ptr = *w_pp;
394 const double *w8qinv_ptr = *wqinv_pp;
395
396 const double *w4_ptr = *(w_pp-1);
397 const double *w4qinv_ptr = *(wqinv_pp-1);
398
399 pd_half w8 = pd_half::load(w8_ptr);
400
401 pd_half w8qinv = pd_half::load(w8qinv_ptr);
402
403
404 pd_qrtr w4_qrtr = pd_qrtr::load(w4_ptr);
405 pd_half w4_half = join(w4_qrtr, w4_qrtr);
406 pd_full w4 = join(w4_half, w4_half);
407 w4 = blend4(dup2even(w4), w4);
408
409 pd_qrtr w4qinv_qrtr = pd_qrtr::load(w4qinv_ptr);
410 pd_half w4qinv_half = join(w4qinv_qrtr, w4qinv_qrtr);
411 pd_full w4qinv = join(w4qinv_half, w4qinv_half);
412 w4qinv = blend4(dup2even(w4qinv), w4qinv);
413
414 do {
415 pd_fft_layer_size1_one_block(xp+0*PDSZ, w8, w8qinv, w4, w4qinv, q);
416 pd_fft_layer_size1_one_block(xp+1*PDSZ, w8, w8qinv, w4, w4qinv, q);
417 pd_fft_layer_size1_one_block(xp+2*PDSZ, w8, w8qinv, w4, w4qinv, q);
418 pd_fft_layer_size1_one_block(xp+3*PDSZ, w8, w8qinv, w4, w4qinv, q);
419
420 xp += 4*PDSZ;
421 blocks -= 4;
422 } while (blocks != 0);
423 }
424 #else
425 // PDSZ == 4
426
427 static inline NTL_ALWAYS_INLINE void
428 pd_fft_layer_size1_one_block(double* x,
429 pd_half w4, pd_half w4qinv,
430 double q)
431 {
432 pd_half x0 = pd_half::load(x);
433 pd_half x1 = pd_half::load(x+PDSZ/2);
434 fwd_butterfly_half(x0, x1, w4, q, w4qinv);
435 pd_full y = join(x0, x1);
436
437 y = pd_fwd_butterfly_packed2(y, q);
438
439 store(x, y);
440 }
441
442 // size == PDSZ == 4
443 // processes last two levels, of size 4 and 2.
444 static inline NTL_ALWAYS_INLINE void
445 pd_fft_layer_size1(double* xp, long blocks,
446 const double **w_pp, const double **wqinv_pp,
447 double q)
448 {
449 const double *w4_ptr = *w_pp;
450 const double *w4qinv_ptr = *wqinv_pp;
451
452
453 pd_half w4 = pd_half::load(w4_ptr);
454 pd_half w4qinv = pd_half::load(w4qinv_ptr);
455
456
457 do {
458 pd_fft_layer_size1_one_block(xp+0*PDSZ, w4, w4qinv, q);
459 pd_fft_layer_size1_one_block(xp+1*PDSZ, w4, w4qinv, q);
460 pd_fft_layer_size1_one_block(xp+2*PDSZ, w4, w4qinv, q);
461 pd_fft_layer_size1_one_block(xp+3*PDSZ, w4, w4qinv, q);
462
463 xp += 4*PDSZ;
464 blocks -= 4;
465 } while (blocks != 0);
466 }
467
468 #endif
469
470
471
472
473 void
474 pd_fft_base(double* xp, long lgN, const pd_mod_t& mod)
475 {
476 double q = mod.q;
477 const double** wtab = mod.wtab;
478 const double** wqinvtab = mod.wqinvtab;
479
480 long N = 1L << lgN;
481
482 long j, size, blocks;
483 for (j = lgN, size = N, blocks = 1;
484 size > 8*PDSZ; j--, blocks <<= 1, size >>= 1)
485 pd_fft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
486
487 pd_fft_layer_size8(xp, blocks, wtab[j], wqinvtab[j], q);
488 j--, blocks <<= 1, size >>= 1;
489
490 pd_fft_layer_size4(xp, blocks, wtab[j], wqinvtab[j], q);
491 j--, blocks <<= 1, size >>= 1;
492
493 pd_fft_layer_size2(xp, blocks, wtab[j], wqinvtab[j], q);
494 j--, blocks <<= 1, size >>= 1;
495
496 pd_fft_layer_size1(xp, blocks, wtab+j, wqinvtab+j, q);
497
498 }
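// Worked example of the schedule above, assuming the 4-lane case PDSZ == 4
// (so 8*PDSZ == 32): for lgN == 11 the generic pd_fft_layer loop handles the
// levels of size 2048, 1024, 512, 256, 128 and 64; the specialized routines
// then take size 32 (pd_fft_layer_size8), 16 (size4) and 8 (size2), and
// pd_fft_layer_size1 finishes the last two levels, of size 4 and 2.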
499
500 static inline NTL_ALWAYS_INLINE void
501 pd_move(double *x, const long *a)
502 {
503 pd_full r;
504 loadu(r, a);
505 store(x, r);
506 }
507
508 static inline NTL_ALWAYS_INLINE void
509 pd_move(long *x, const double *a)
510 {
511 pd_full r;
512 load(r, a);
513 storeu(x, r);
514 }
515
516 static inline NTL_ALWAYS_INLINE void
517 pd_reduce1_move(long *x, const double *a, double q)
518 {
519 pd_full r;
520 load(r, a);
521 r = pd_LazyReduce1(r, q);
522 storeu(x, r);
523 }
524
525 static inline NTL_ALWAYS_INLINE void
526 pd_reduce2_move(long *x, const double *a, double q)
527 {
528 pd_full r;
529 load(r, a);
530 r = pd_LazyReduce2(r, q);
531 r = pd_LazyReduce1(r, q);
532 storeu(x, r);
533 }
534
535 static inline NTL_ALWAYS_INLINE void
536 pd_mul_move(long *x, const double *a, pd_full b, double q, pd_full bqinv)
537 {
538 pd_full r;
539 load(r, a);
540 r = pd_LazyMulModPrecon(r, b, q, bqinv);
541 r = pd_LazyReduce1(r, q);
542 storeu(x, r);
543 }
544
545
546
547
548 static
549 void pd_fft_short(double* xp, long yn, long xn, long lgN,
550 const pd_mod_t& mod)
551 {
552 long N = 1L << lgN;
553
554 if (yn == N)
555 {
556 if (xn == N && lgN <= NTL_PD_FFT_THRESH)
557 {
558 // no truncation
559 pd_fft_base(xp, lgN, mod);
560 return;
561 }
562 }
563
564 // divide-and-conquer algorithm
565
566 long half = N >> 1;
567 double q = mod.q;
568
569 if (yn <= half)
570 {
571 if (xn <= half)
572 {
573 pd_fft_short(xp, yn, xn, lgN - 1, mod);
574 }
575 else
576 {
577 xn -= half;
578
579 // (X, Y) -> X + Y
580 for (long j = 0; j < xn; j+=PDSZ)
581 store(xp+j, pd_LazyAddMod2(PDLD(xp+j), PDLD(xp+j+half), q));
582
583 pd_fft_short(xp, yn, half, lgN - 1, mod);
584 }
585 }
586 else
587 {
588 yn -= half;
589
590 double* xp0 = xp;
591 double* xp1 = xp + half;
592 const double* wtab = mod.wtab[lgN];
593 const double* wqinvtab = mod.wqinvtab[lgN];
594
595 if (xn <= half)
596 {
597 // X -> (X, w*X)
598 for (long j = 0; j < xn; j+=PDSZ)
599 store(xp1+j, pd_LazyMulModPrecon(PDLD(xp0+j), PDLD(wtab+j), q, PDLD(wqinvtab+j)));
600
601 pd_fft_short(xp0, half, xn, lgN - 1, mod);
602 pd_fft_short(xp1, yn, xn, lgN - 1, mod);
603 }
604 else
605 {
606 xn -= half;
607
608 // (X, Y) -> (X + Y, w*(X - Y))
609 pd_fft_layer_inner_loop(xp0, xp1, xn, wtab, wqinvtab, q);
610
611 // X -> (X, w*X)
612 for (long j = xn; j < half; j+=PDSZ)
613 store(xp1+j, pd_LazyMulModPrecon(PDLD(xp0+j), PDLD(wtab+j), q, PDLD(wqinvtab+j)));
614
615 pd_fft_short(xp0, half, half, lgN - 1, mod);
616 pd_fft_short(xp1, yn, half, lgN - 1, mod);
617 }
618 }
619 }
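// Note on the recursion above, added for reference: viewing the length-N input
// as two halves (X, Y) of length N/2, one decimation-in-frequency step maps
// (X, Y) -> (X + Y, w*(X - Y)) and each half is then transformed recursively;
// the first N/2 outputs come from the X + Y half, the rest from the w*(X - Y)
// half.  Truncation exploits this: if only yn <= N/2 outputs are wanted, the
// w*(X - Y) half is never computed (the code simply folds Y into X when
// xn > N/2), and if only xn <= N/2 inputs are nonzero, Y is implicitly zero
// and the splitting step degenerates to X -> (X, w*X).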
620
621
622 void
623 pd_fft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
624 long yn, long xn)
625
626 {
627 for (long i = 0; i < xn; i += 4*PDSZ) {
628 pd_move(xp+i+0*PDSZ, a+i+0*PDSZ);
629 pd_move(xp+i+1*PDSZ, a+i+1*PDSZ);
630 pd_move(xp+i+2*PDSZ, a+i+2*PDSZ);
631 pd_move(xp+i+3*PDSZ, a+i+3*PDSZ);
632 }
633
634 pd_fft_short(xp, yn, xn, lgN, mod);
635
636 double q = mod.q;
637 for (long i = 0; i < yn; i += 4*PDSZ) {
638 pd_reduce1_move(A+i+0*PDSZ, xp+i+0*PDSZ, q);
639 pd_reduce1_move(A+i+1*PDSZ, xp+i+1*PDSZ, q);
640 pd_reduce1_move(A+i+2*PDSZ, xp+i+2*PDSZ, q);
641 pd_reduce1_move(A+i+3*PDSZ, xp+i+3*PDSZ, q);
642 }
643 }
644
645
646 void
647 pd_fft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
648 long yn, long xn, double fac)
649
650 {
651 for (long i = 0; i < xn; i += 4*PDSZ) {
652 pd_move(xp+i+0*PDSZ, a+i+0*PDSZ);
653 pd_move(xp+i+1*PDSZ, a+i+1*PDSZ);
654 pd_move(xp+i+2*PDSZ, a+i+2*PDSZ);
655 pd_move(xp+i+3*PDSZ, a+i+3*PDSZ);
656 }
657
658 pd_fft_short(xp, yn, xn, lgN, mod);
659
660 double q = mod.q;
661 double facqinv = fac/q;
662 for (long i = 0; i < yn; i += 4*PDSZ) {
663 pd_mul_move(A+i+0*PDSZ, xp+i+0*PDSZ, fac, q, facqinv);
664 pd_mul_move(A+i+1*PDSZ, xp+i+1*PDSZ, fac, q, facqinv);
665 pd_mul_move(A+i+2*PDSZ, xp+i+2*PDSZ, fac, q, facqinv);
666 pd_mul_move(A+i+3*PDSZ, xp+i+3*PDSZ, fac, q, facqinv);
667 }
668 }
669
670 //================ ifft ==============
671
672 // return (a[0] + a[1], a[0] - a[1], a[2] + a[3], a[2] - a[3], ...)
673 // all inputs and outputs in [0, 4*n)
674 template<class pd> pd
675 pd_inv_butterfly_packed2(pd a, double n)
676 {
677 a = pd_LazyReduce2(a, n);
678 pd b = swap2(a);
679 pd sum = pd_LazyAddMod(a, b, n);
680 pd diff = pd_LazySubMod(b, a, n);
681 pd res = blend2(sum, diff);
682 return res;
683 }
684
685 // return (a[0] + a[2], a[1] + a[3]*root, a[0] - a[2], a[1] - a[3]*root, ...)
686 // all inputs and outputs in [0, 4*n)
687 // it is also assumed that w = (1,1,1,root,...) and wninv = RoundDown(w/n)
688 template<class pd> pd
689 pd_inv_butterfly_packed4(pd a, pd w, double n, pd wninv)
690 {
691 a = pd_LazyMulModPrecon(a, w, n, wninv);
692 pd b = swap4(a);
693 pd sum = pd_LazyAddMod(a, b, n);
694 pd diff = pd_LazySubMod(b, a, n);
695 pd res = blend4(sum, diff);
696 return res;
697 }
698
699 #define inv_butterfly_half(xx0, xx1, w, q, wqinv) \
700 do \
701 { \
702 pd_half x0_ = pd_LazyReduce2(xx0, q); \
703 pd_half x1_ = xx1; \
704 pd_half t_ = pd_LazyMulModPrecon(x1_, w, q, wqinv); \
705 xx0 = pd_LazyAddMod(x0_, t_, q); \
706 xx1 = pd_LazySubMod(x0_, t_, q); \
707 } while (0)
708
709
710 #define inv_butterfly(xx0_ptr, xx1_ptr, w_ptr, q, wqinv_ptr) \
711 do \
712 { \
713 pd_full x0_ = pd_LazyReduce2(PDLD(xx0_ptr), q); \
714 pd_full x1_ = PDLD(xx1_ptr); \
715 pd_full t_ = pd_LazyMulModPrecon(x1_, PDLD(w_ptr), q, PDLD(wqinv_ptr)); \
716 store(xx0_ptr, pd_LazyAddMod(x0_, t_, q)); \
717 store(xx1_ptr, pd_LazySubMod(x0_, t_, q)); \
718 } while (0)
719
720 #define inv_butterfly_x4(xx0_ptr, xx1_ptr, w_ptr, q, wqinv_ptr) \
721 do \
722 { \
723 inv_butterfly(xx0_ptr+0*PDSZ, xx1_ptr+0*PDSZ, w_ptr+0*PDSZ, q, wqinv_ptr+0*PDSZ); \
724 inv_butterfly(xx0_ptr+1*PDSZ, xx1_ptr+1*PDSZ, w_ptr+1*PDSZ, q, wqinv_ptr+1*PDSZ); \
725 inv_butterfly(xx0_ptr+2*PDSZ, xx1_ptr+2*PDSZ, w_ptr+2*PDSZ, q, wqinv_ptr+2*PDSZ); \
726 inv_butterfly(xx0_ptr+3*PDSZ, xx1_ptr+3*PDSZ, w_ptr+3*PDSZ, q, wqinv_ptr+3*PDSZ); \
727 } \
728 while(0)
729
730 static inline NTL_ALWAYS_INLINE void
731 pd_ifft_layer_inner_loop(double* NTL_RESTRICT xp0,
732 double* NTL_RESTRICT xp1,
733 long size,
734 const double* NTL_RESTRICT wtab,
735 const double* NTL_RESTRICT wqinvtab,
736 double q)
737
738 {
739 long j = 0;
740 do {
741 inv_butterfly_x4(xp0+j, xp1+j, wtab+j, q, wqinvtab+j);
742 j += 4*PDSZ;
743 } while (j < size);
744 }
745
746 // assumes size >= 8*PDSZ
747 static inline NTL_ALWAYS_INLINE void
748 pd_ifft_layer(double* xp, long blocks, long size,
749 const double* wtab,
750 const double* wqinvtab,
751 double q)
752 {
753 size /= 2;
754
755 do {
756 pd_ifft_layer_inner_loop(xp, xp+size, size, wtab, wqinvtab, q);
757 xp += 2 * size;
758 } while (--blocks != 0);
759 }
760
761 // size == 8*PDSZ
762 static inline NTL_ALWAYS_INLINE void
763 pd_ifft_layer_size8(double* NTL_RESTRICT xp, long blocks,
764 const double* NTL_RESTRICT wtab,
765 const double* NTL_RESTRICT wqinvtab,
766 double q)
767 {
768 do {
769 inv_butterfly_x4(xp+0*PDSZ, xp+4*PDSZ, wtab, q, wqinvtab);
770 xp += 8*PDSZ;
771 } while (--blocks != 0);
772 }
773
774 // size == 4*PDSZ
775 static inline NTL_ALWAYS_INLINE void
776 pd_ifft_layer_size4(double* NTL_RESTRICT xp, long blocks,
777 const double* NTL_RESTRICT wtab,
778 const double* NTL_RESTRICT wqinvtab,
779 double q)
780 {
781 do {
782 inv_butterfly(xp+0*PDSZ, xp+2*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
783 inv_butterfly(xp+1*PDSZ, xp+3*PDSZ, wtab+1*PDSZ, q, wqinvtab+1*PDSZ);
784
785 inv_butterfly(xp+4*PDSZ, xp+6*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
786 inv_butterfly(xp+5*PDSZ, xp+7*PDSZ, wtab+1*PDSZ, q, wqinvtab+1*PDSZ);
787
788 xp += 8*PDSZ;
789 blocks -= 2;
790 } while (blocks != 0);
791 }
792
793 // size == 2*PDSZ
794 static inline NTL_ALWAYS_INLINE void
795 pd_ifft_layer_size2(double* NTL_RESTRICT xp, long blocks,
796 const double* NTL_RESTRICT wtab,
797 const double* NTL_RESTRICT wqinvtab,
798 double q)
799 {
800 do {
801 inv_butterfly(xp+0*PDSZ, xp+1*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
802 inv_butterfly(xp+2*PDSZ, xp+3*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
803 inv_butterfly(xp+4*PDSZ, xp+5*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
804 inv_butterfly(xp+6*PDSZ, xp+7*PDSZ, wtab+0*PDSZ, q, wqinvtab+0*PDSZ);
805
806 xp += 8*PDSZ;
807 blocks -= 4;
808 } while (blocks != 0);
809 }
810
811 #if (PDSZ == 8)
812 static inline NTL_ALWAYS_INLINE void
813 pd_ifft_layer_size1_one_block(double* x,
814 pd_half w8, pd_half w8qinv,
815 pd_full w4, pd_full w4qinv,
816 double q)
817 {
818 pd_full y = PDLD(x);
819 y = pd_inv_butterfly_packed2(y, q);
820 y = pd_inv_butterfly_packed4(y, w4, q, w4qinv);
821
822 pd_half x0 = get_lo(y);
823 pd_half x1 = get_hi(y);
824 inv_butterfly_half(x0, x1, w8, q, w8qinv);
825
826 store(x, x0);
827 store(x+PDSZ/2, x1);
828 }
829
830 // size == PDSZ == 8
831 // processes last three levels, of size 8, 4, and 2.
832 static inline NTL_ALWAYS_INLINE void
833 pd_ifft_layer_size1(double* xp, long blocks,
834 const double **w_pp, const double **wqinv_pp,
835 double q)
836 {
837 const double *w8_ptr = *w_pp;
838 const double *w8qinv_ptr = *wqinv_pp;
839
840 const double *w4_ptr = *(w_pp-1);
841 const double *w4qinv_ptr = *(wqinv_pp-1);
842
843 pd_half w8 = pd_half::load(w8_ptr);
844
845 pd_half w8qinv = pd_half::load(w8qinv_ptr);
846
847
848 pd_qrtr w4_qrtr = pd_qrtr::load(w4_ptr);
849 pd_half w4_half = join(w4_qrtr, w4_qrtr);
850 pd_full w4 = join(w4_half, w4_half);
851 w4 = blend4(dup2even(w4), w4);
852
853 pd_qrtr w4qinv_qrtr = pd_qrtr::load(w4qinv_ptr);
854 pd_half w4qinv_half = join(w4qinv_qrtr, w4qinv_qrtr);
855 pd_full w4qinv = join(w4qinv_half, w4qinv_half);
856 w4qinv = blend4(dup2even(w4qinv), w4qinv);
857
858 do {
859 pd_ifft_layer_size1_one_block(xp+0*PDSZ, w8, w8qinv, w4, w4qinv, q);
860 pd_ifft_layer_size1_one_block(xp+1*PDSZ, w8, w8qinv, w4, w4qinv, q);
861 pd_ifft_layer_size1_one_block(xp+2*PDSZ, w8, w8qinv, w4, w4qinv, q);
862 pd_ifft_layer_size1_one_block(xp+3*PDSZ, w8, w8qinv, w4, w4qinv, q);
863
864 xp += 4*PDSZ;
865 blocks -= 4;
866 } while (blocks != 0);
867 }
868 #else
869 // PDSZ == 4
870
871 static inline NTL_ALWAYS_INLINE void
872 pd_ifft_layer_size1_one_block(double* x,
873 pd_half w4, pd_half w4qinv,
874 double q)
875 {
876 pd_full y = PDLD(x);
877 y = pd_inv_butterfly_packed2(y, q);
878
879 pd_half x0 = get_lo(y);
880 pd_half x1 = get_hi(y);
881 inv_butterfly_half(x0, x1, w4, q, w4qinv);
882
883 store(x, x0);
884 store(x+PDSZ/2, x1);
885 }
886
887 // size == PDSZ == 4
888 // processes last two levels, of size 4 and 2.
889 static inline NTL_ALWAYS_INLINE void
890 pd_ifft_layer_size1(double* xp, long blocks,
891 const double **w_pp, const double **wqinv_pp,
892 double q)
893 {
894 const double *w4_ptr = *w_pp;
895 const double *w4qinv_ptr = *wqinv_pp;
896
897
898 pd_half w4 = pd_half::load(w4_ptr);
899 pd_half w4qinv = pd_half::load(w4qinv_ptr);
900
901
902 do {
903 pd_ifft_layer_size1_one_block(xp+0*PDSZ, w4, w4qinv, q);
904 pd_ifft_layer_size1_one_block(xp+1*PDSZ, w4, w4qinv, q);
905 pd_ifft_layer_size1_one_block(xp+2*PDSZ, w4, w4qinv, q);
906 pd_ifft_layer_size1_one_block(xp+3*PDSZ, w4, w4qinv, q);
907
908 xp += 4*PDSZ;
909 blocks -= 4;
910 } while (blocks != 0);
911 }
912
913 #endif
914
915 void
916 pd_ifft_base(double* xp, long lgN, const pd_mod_t& mod)
917 {
918 double q = mod.q;
919 const double** wtab = mod.wtab;
920 const double** wqinvtab = mod.wqinvtab;
921
922 long N = 1L << lgN;
923
924 long j=PDLGSZ, size=PDSZ, blocks=N/PDSZ;
925
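   // levels of size up to 8*PDSZ are handled by the specialized kernels
   // below; any remaining larger levels fall through to the generic
   // pd_ifft_layer loop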
926 pd_ifft_layer_size1(xp, blocks, wtab+j, wqinvtab+j, q);
927 j++, blocks >>= 1, size <<= 1;
928
929 pd_ifft_layer_size2(xp, blocks, wtab[j], wqinvtab[j], q);
930 j++, blocks >>= 1, size <<= 1;
931
932 pd_ifft_layer_size4(xp, blocks, wtab[j], wqinvtab[j], q);
933 j++, blocks >>= 1, size <<= 1;
934
935 pd_ifft_layer_size8(xp, blocks, wtab[j], wqinvtab[j], q);
936 j++, blocks >>= 1, size <<= 1;
937
938 for (; size <= N; j++, blocks >>= 1, size <<= 1)
939 pd_ifft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
940
941 }
942
943 static void
944 pd_ifft_short2(double* xp, long yn, long lgN, const pd_mod_t& mod);
945
946
947 static void
948 pd_ifft_short1(double* xp, long yn, long lgN, const pd_mod_t& mod)
949
950 // Implements truncated inverse FFT interface, but with xn==yn.
951 // All computations are done in place.
952
953 {
954 long N = 1L << lgN;
955
956 if (yn == N && lgN <= NTL_PD_FFT_THRESH)
957 {
958 // no truncation
959 pd_ifft_base(xp, lgN, mod);
960 return;
961 }
962
963 // divide-and-conquer algorithm
964
965 long half = N >> 1;
966 double q = mod.q;
967
968 if (yn <= half)
969 {
970 // X -> 2X
971 for (long j = 0; j < yn; j+=PDSZ)
972 store(xp+j, pd_LazyDoubleMod4(PDLD(xp+j), q));
973
974 pd_ifft_short1(xp, yn, lgN - 1, mod);
975 }
976 else
977 {
978 double* xp0 = xp;
979 double* xp1 = xp + half;
980
981 pd_ifft_short1(xp0, half, lgN - 1, mod);
982
983 yn -= half;
984
985 if (yn < half) {
986 const double* wtab1 = mod.wtab1[lgN];
987 const double* wqinvtab1 = mod.wqinvtab1[lgN];
988
989 // X -> (2X, w*X)
990 for (long j = yn; j < half; j+=PDSZ)
991 {
992 pd_full x0 = PDLD(xp0+j);
993 store(xp0+j, pd_LazyDoubleMod4(x0, q));
994 store(xp1+j, pd_LazyMulModPrecon(x0, PDLD(wtab1+j), q, PDLD(wqinvtab1+j)));
995 }
996 }
997
998 pd_ifft_short2(xp1, yn, lgN - 1, mod);
999
1000 // (X, Y) -> (X + Y/w, X - Y/w)
1001 pd_ifft_layer_inner_loop(xp0, xp1, yn, mod.wtab[lgN], mod.wqinvtab[lgN], q);
1002 }
1003 }
1004
1005
1006 static void
1007 pd_ifft_short2(double* xp, long yn, long lgN, const pd_mod_t& mod)
1008
1009 // Implements truncated inverse FFT interface, but with xn==N.
1010 // All computations are done in place.
1011
1012 {
1013 long N = 1L << lgN;
1014
1015 if (yn == N && lgN <= NTL_PD_FFT_THRESH)
1016 {
1017 // no truncation
1018 pd_ifft_base(xp, lgN, mod);
1019 return;
1020 }
1021
1022 // divide-and-conquer algorithm
1023
1024 long half = N >> 1;
1025 double q = mod.q;
1026
1027 if (yn <= half)
1028 {
1029 // X -> 2X
1030 for (long j = 0; j < yn; j+=PDSZ)
1031 store(xp+j, pd_LazyDoubleMod4(PDLD(xp+j), q));
1032
1033 // (X, Y) -> X + Y
1034 for (long j = yn; j < half; j+=PDSZ)
1035 store(xp+j, pd_LazyAddMod4(PDLD(xp+j), PDLD(xp+j+half), q));
1036
1037 pd_ifft_short2(xp, yn, lgN - 1, mod);
1038
1039 // (X, Y) -> X - Y
1040 for (long j = 0; j < yn; j+=PDSZ)
1041 store(xp+j, pd_LazySubMod4(PDLD(xp+j), PDLD(xp+j+half), q));
1042 }
1043 else
1044 {
1045 double* xp0 = xp;
1046 double* xp1 = xp + half;
1047
1048 pd_ifft_short1(xp0, half, lgN - 1, mod);
1049
1050 yn -= half;
1051
1052
1053 if (yn < half) {
1054 const double* wtab1 = mod.wtab1[lgN];
1055 const double* wqinvtab1 = mod.wqinvtab1[lgN];
1056
1057 // (X, Y) -> (2X - Y, w*(X - Y))
1058 for (long j = yn; j < half; j+=PDSZ)
1059 {
1060 pd_full x0 = PDLD(xp0+j);
1061 pd_full x1 = PDLD(xp1+j);
1062 pd_full u = pd_LazySubMod4(x0, x1, q);
1063 store(xp0+j, pd_LazyAddMod4(x0, u, q));
1064 store(xp1+j, pd_LazyMulModPrecon(u, PDLD(wtab1+j), q, PDLD(wqinvtab1+j)));
1065 }
1066 }
1067
1068 pd_ifft_short2(xp1, yn, lgN - 1, mod);
1069
1070 // (X, Y) -> (X + Y/w, X - Y/w)
1071 pd_ifft_layer_inner_loop(xp0, xp1, yn, mod.wtab[lgN], mod.wqinvtab[lgN], q);
1072 }
1073 }
1074
1075
1076 void
1077 pd_ifft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
1078 long yn, double fac)
1079 {
1080 long N = 1L << lgN;
1081
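   // load the first yn coefficients, converting from long to double,
   // four vectors per iteration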
1082 for (long i = 0; i < yn; i += 4*PDSZ) {
1083 pd_move(xp+i+0*PDSZ, a+i+0*PDSZ);
1084 pd_move(xp+i+1*PDSZ, a+i+1*PDSZ);
1085 pd_move(xp+i+2*PDSZ, a+i+2*PDSZ);
1086 pd_move(xp+i+3*PDSZ, a+i+3*PDSZ);
1087 }
1088
1089 pd_ifft_short1(xp, yn, lgN, mod);
1090
1091 double q = mod.q;
1092 double facqinv = fac/q;
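   // scale the inverse-transform output by fac (mod q, via the precomputed
   // quotient facqinv) and store the results back into A as longs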
1093 for (long i = 0; i < yn; i += 4*PDSZ) {
1094 pd_mul_move(A+i+0*PDSZ, xp+i+0*PDSZ, fac, q, facqinv);
1095 pd_mul_move(A+i+1*PDSZ, xp+i+1*PDSZ, fac, q, facqinv);
1096 pd_mul_move(A+i+2*PDSZ, xp+i+2*PDSZ, fac, q, facqinv);
1097 pd_mul_move(A+i+3*PDSZ, xp+i+3*PDSZ, fac, q, facqinv);
1098 }
1099 }
1100
1101
1102 void
1103 pd_ifft_trunc_impl(long* A, const long* a, double* xp, long lgN, const pd_mod_t& mod,
1104 long yn)
1105 {
1106 long N = 1L << lgN;
1107
1108 for (long i = 0; i < yn; i += 4*PDSZ) {
1109 pd_move(xp+i+0*PDSZ, a+i+0*PDSZ);
1110 pd_move(xp+i+1*PDSZ, a+i+1*PDSZ);
1111 pd_move(xp+i+2*PDSZ, a+i+2*PDSZ);
1112 pd_move(xp+i+3*PDSZ, a+i+3*PDSZ);
1113 }
1114
1115 pd_ifft_short1(xp, yn, lgN, mod);
1116
1117 double q = mod.q;
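   // reduce the inverse-transform output mod q and store it back into A as longs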
1118 for (long i = 0; i < yn; i += 4*PDSZ) {
1119 pd_reduce2_move(A+i+0*PDSZ, xp+i+0*PDSZ, q);
1120 pd_reduce2_move(A+i+1*PDSZ, xp+i+1*PDSZ, q);
1121 pd_reduce2_move(A+i+2*PDSZ, xp+i+2*PDSZ, q);
1122 pd_reduce2_move(A+i+3*PDSZ, xp+i+3*PDSZ, q);
1123 }
1124 }
1125
1126 NTL_END_IMPL
1127
1128 #endif
4848 #endif
4949
5050 #include <NTL/quad_float.h>
51 #include <NTL/RR.h>
52
5351 #include <cfloat>
5452
5553
8179
8280
8381 #define START_FIX \
84 volatile unsigned short __old_cw, __new_cw; \
85 asm volatile ("fnstcw %0":"=m" (__old_cw)); \
82 unsigned short __old_cw, __new_cw; \
83 __asm__ volatile ("fnstcw %0":"=m" (__old_cw)::"memory"); \
8684 __new_cw = (__old_cw & ~0x300) | 0x200; \
87 asm volatile ("fldcw %0": :"m" (__new_cw));
88
89
90 #define END_FIX asm volatile ("fldcw %0": :"m" (__old_cw));
85 __asm__ volatile ("fldcw %0"::"m" (__new_cw):"memory");
86
87
88 #define END_FIX __asm__ volatile ("fldcw %0": :"m" (__old_cw));
89
90 // NOTE: "asm volatile" does not guarantee that the asm does
91 // not move. However, the "memory" clobber makes these act as
92 // memory barriers that cannot be moved past a load or store.
93
94 #define NO_INLINE __attribute__ ((noinline))
95 // to protect against LTO inlining which could break the memory
96 // barriers in START_FIX and END_FIX. I've done some testing
97 // on gcc, clang, and icc. The noinline attribute and the volatile
98 // asm together should ensure that the function gets called
99 // and doesn't get inlined during LTO.
100 // That said, I wouldn't really recommend applying LTO to NTL...
101 // and especially to quad_float.cpp.
102
103
104 // NOTE: gcc 8.1 seems a bit buggy: it warns when overloading a function
105 // with different inline attributes. Earlier versions are fine.
106 // ICC and CLANG are fine.
107
108 // NOTE: starting with gcc 8.1, there is a function attribute called
109 // "noipa" which really does exactly what I want. It would also be useful
110 // for ForceToMem, for example.
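// The sketch below is an illustration only (the helper name two_sum_53 is
// not part of NTL): a non-inlined routine brackets an error-free addition
// between START_FIX and END_FIX, so that on x87 targets the arithmetic is
// carried out at true 53-bit double precision and the rounding error is
// recovered exactly.

static NO_INLINE void two_sum_53(double a, double b, double& s, double& e)
{
   START_FIX
   DOUBLE S = a + b;                         // rounded 53-bit sum
   DOUBLE bv = S - a;
   DOUBLE err = (a - (S - bv)) + (b - bv);   // exact rounding error (TwoSum)
   s = S;
   e = err;
   END_FIX
}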
91111
92112 #else
93113
94114 #define START_FIX
95115 #define END_FIX
96116
97 #endif
98
99
100 static
101 void normalize(quad_float& z, const double& xhi, const double& xlo)
117 #define NO_INLINE
118
119 #endif
120
121
122
123
124 NO_INLINE void quad_float_normalize(quad_float& z, const double& xhi, const double& xlo)
102125 {
103126 START_FIX
104127 DOUBLE u, v;
112135 END_FIX
113136 }
114137
115
116
117 #if (NTL_BITS_PER_LONG >= NTL_DOUBLE_PRECISION)
118
119
120 quad_float to_quad_float(long n)
121 {
122 DOUBLE xhi, xlo;
123
124 xhi = TrueDouble(n);
125
126 // Because we are assuming 2's complement integer
127 // arithmetic, the following prevents long(xhi) from overflowing.
128
129 if (n > 0)
130 xlo = TrueDouble(n+long(-xhi));
131 else
132 xlo = TrueDouble(n-long(xhi));
133
134 // renormalize...just to be safe
135
136 quad_float z;
137 normalize(z, xhi, xlo);
138 return z;
139 }
140
141 quad_float to_quad_float(unsigned long n)
142 {
143 DOUBLE xhi, xlo, t;
144
145 const double bnd = double(1L << (NTL_BITS_PER_LONG-2))*4.0;
146
147 xhi = TrueDouble(n);
148
149 if (xhi >= bnd)
150 t = xhi - bnd;
151 else
152 t = xhi;
153
154 // we use the "to_long" function here to be as portable as possible.
155 long llo = to_long(n - (unsigned long)(t));
156 xlo = TrueDouble(llo);
157
158 quad_float z;
159 normalize(z, xhi, xlo);
160 return z;
161 }
162 #endif
163
164
165 NTL_CHEAP_THREAD_LOCAL
166 long quad_float::oprec = 10;
167
168 void quad_float::SetOutputPrecision(long p)
169 {
170 if (p < 1) p = 1;
171
172 if (NTL_OVERFLOW(p, 1, 0))
173 ResourceError("quad_float: output precision too big");
174
175 oprec = p;
176 }
177
178
179 quad_float operator +(const quad_float& x, const quad_float& y ) {
138 NO_INLINE void quad_float_in_place_add(quad_float& x, const quad_float& y ) {
180139 START_FIX
181140 DOUBLE H, h, T, t, S, s, e, f;
182141 DOUBLE t1;
206165 e = H + h;
207166 f = H - e;
208167 f = f + h;
209 END_FIX
210 return quad_float(e, f);
211 }
212
213 quad_float& operator +=(quad_float& x, const quad_float& y ) {
214 START_FIX
215 DOUBLE H, h, T, t, S, s, e, f;
216 DOUBLE t1;
217
218 S = x.hi + y.hi;
219 T = x.lo + y.lo;
220 e = S - x.hi;
221 f = T - x.lo;
222
223 t1 = S-e;
224 t1 = x.hi-t1;
225 s = y.hi-e;
226 s = s + t1;
227
228 t1 = T-f;
229 t1 = x.lo-t1;
230 t = y.lo-f;
231 t = t + t1;
232
233
234 s = s + T;
235 H = S + s;
236 h = S - H;
237 h = h + s;
238
239 h = h + t;
240 e = H + h;
241 f = H - e;
242 f = f + h;
243168
244169 x.hi = e;
245170 x.lo = f;
246171 END_FIX
247 return x;
248 }
249
250 quad_float operator -(const quad_float& x, const quad_float& y ) {
172 }
173
174
175 NO_INLINE void quad_float_in_place_sub(quad_float& x, const quad_float& y ) {
251176 START_FIX
252177 DOUBLE H, h, T, t, S, s, e, f;
253178 DOUBLE t1, yhi, ylo;
281206 f = H - e;
282207 f = f + h;
283208
284 END_FIX
285 return quad_float(e, f);
286 }
287
288 quad_float& operator -=(quad_float& x, const quad_float& y ) {
289 START_FIX
290 DOUBLE H, h, T, t, S, s, e, f;
291 DOUBLE t1, yhi, ylo;
292
293 yhi = -y.hi;
294 ylo = -y.lo;
295
296 S = x.hi + yhi;
297 T = x.lo + ylo;
298 e = S - x.hi;
299 f = T - x.lo;
300
301 t1 = S-e;
302 t1 = x.hi-t1;
303 s = yhi-e;
304 s = s + t1;
305
306 t1 = T-f;
307 t1 = x.lo-t1;
308 t = ylo-f;
309 t = t + t1;
310
311
312 s = s + T;
313 H = S + s;
314 h = S - H;
315 h = h + s;
316
317 h = h + t;
318 e = H + h;
319 f = H - e;
320 f = f + h;
321
322209 x.hi = e;
323210 x.lo = f;
324211 END_FIX
325 return x;
326 }
327
328 quad_float operator -(const quad_float& x)
212 }
213
214 NO_INLINE void quad_float_in_place_negate(quad_float& x)
329215 {
330216 START_FIX
331217 DOUBLE xhi, xlo, u, v;
341227 v = xhi - u;
342228 v = v + xlo;
343229
344 END_FIX
345 return quad_float(u, v);
230 x.hi = u;
231 x.lo = v;
232 END_FIX
346233 }
347234
348235
349236
350237 #if (NTL_FMA_DETECTED && !defined(NTL_CONTRACTION_FIXED))
351238
352 // The configure script should fix this issue for most
353 // compilers (at least gcc, clang, and icc), but if not,
354 // this is a last ditch effort to fix it (which seems to work).
239
240 // The configure script should ensure that no FMAs are issued
241 // for most compilers (at least gcc, clang, and icc), but if not,
242 // this is a last-ditch effort to fix the problem (which seems to work).
355243
356244 double quad_float_zero = 0;
357245
367255
368256 #endif
369257
370 // NOTE: this is really sick: some compilers will issue FMA
371 // (fused mul add) instructions which will break correctness.
372 // C99 standard is supposed to prevent this across separate
373 // statements, but C++ standard doesn't guarantee much at all.
374 // In any case, gcc does not even implement the C99 standard
375 // correctly. One could disable this by compiling with
376 // an appropriate flag: -mno-fma works for gcc, while -no-fma works
377 // for icc. icc and MSVC++ also support pragmas to do this:
378 // #pragma fp_contract(off). There is also a compiler flag for
379 // gcc: -ffp-contract=off, but -mno-fma seems more widely supported.
380 // These flags work for clang, as well.
381 //
382 // But in any case, I'd rather not mess with getting these flags right.
383 // Calling Protect(a*b) has the effect of forcing the
384 // compiler to compute a*b + 0. Assuming the compiler otherwise
385 // does not perform any re-association, this should do the trick.
386 // There is a small performance penalty, but it should be reasonable.
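// A minimal sketch of that idea (the helper name ProtectSketch is
// hypothetical; NTL's actual Protect may be defined differently):

extern double quad_float_zero;   // opaque global defined above; the compiler
                                 // cannot assume its value is zero

static inline double ProtectSketch(double x) { return x + quad_float_zero; }

// Unprotected, "hx*hy - C" may be contracted into fma(hx, hy, -C), which
// skips the intermediate rounding that the error-free product relies on.
// Protected, "ProtectSketch(hx*hy) - C" subtracts from the result of an
// addition, which floating-point contraction is not allowed to fuse with
// the earlier multiply.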
387
388
389
390 quad_float operator *(const quad_float& x,const quad_float& y ) {
258
259
260
261 NO_INLINE void quad_float_in_place_mul(quad_float& x,const quad_float& y ) {
391262 START_FIX
392263 DOUBLE hx, tx, hy, ty, C, c;
393264 DOUBLE t1, t2;
422293 tx = C-hx;
423294 tx = tx+c;
424295
425 END_FIX
426 return quad_float(hx, tx);
427 }
428
429 quad_float& operator *=(quad_float& x,const quad_float& y ) {
430 START_FIX
431 DOUBLE hx, tx, hy, ty, C, c;
432 DOUBLE t1, t2;
433
434 C = Protect(NTL_QUAD_FLOAT_SPLIT*x.hi);
435 hx = C-x.hi;
436 c = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
437 hx = C-hx;
438 tx = x.hi-hx;
439 hy = c-y.hi;
440 C = Protect(x.hi*y.hi);
441 hy = c-hy;
442 ty = y.hi-hy;
443
444 // c = ((((hx*hy-C)+hx*ty)+tx*hy)+tx*ty)+(x.hi*y.lo+x.lo*y.hi);
445
446 t1 = Protect(hx*hy);
447 t1 = t1-C;
448 t2 = Protect(hx*ty);
449 t1 = t1+t2;
450 t2 = Protect(tx*hy);
451 t1 = t1+t2;
452 t2 = Protect(tx*ty);
453 c = t1+t2;
454 t1 = Protect(x.hi*y.lo);
455 t2 = Protect(x.lo*y.hi);
456 t1 = t1+t2;
457 c = c + t1;
458
459
460 hx = C+c;
461 tx = C-hx;
462 tx = tx+c;
463
464296 x.hi = hx;
465297 x.lo = tx;
466298 END_FIX
467 return x;
468 }
469
470 quad_float operator /(const quad_float& x, const quad_float& y ) {
299 }
300
301
302 NO_INLINE void quad_float_in_place_div(quad_float& x, const quad_float& y ) {
471303 START_FIX
472304 DOUBLE hc, tc, hy, ty, C, c, U, u;
473305 DOUBLE t1;
507339 ty = C-hy;
508340 ty = ty+c;
509341
510 END_FIX
511 return quad_float(hy, ty);
512 }
513
514 quad_float& operator /=(quad_float& x, const quad_float& y ) {
515 START_FIX
516 DOUBLE hc, tc, hy, ty, C, c, U, u;
517 DOUBLE t1;
518
519 C = x.hi/y.hi;
520 c = Protect(NTL_QUAD_FLOAT_SPLIT*C);
521 hc = c-C;
522 u = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
523 hc = c-hc;
524 tc = C-hc;
525 hy = u-y.hi;
526 U = Protect(C * y.hi);
527 hy = u-hy;
528 ty = y.hi-hy;
529
530 // u = (((hc*hy-U)+hc*ty)+tc*hy)+tc*ty;
531
532 u = Protect(hc*hy);
533 u = u-U;
534 t1 = Protect(hc*ty);
535 u = u+t1;
536 t1 = Protect(tc*hy);
537 u = u+t1;
538 t1 = Protect(tc*ty);
539 u = u+t1;
540
541 // c = ((((x.hi-U)-u)+x.lo)-C*y.lo)/y.hi;
542
543 c = x.hi-U;
544 c = c-u;
545 c = c+x.lo;
546 t1 = Protect(C*y.lo);
547 c = c - t1;
548 c = c/y.hi;
549
550 hy = C+c;
551 ty = C-hy;
552 ty = ty+c;
553
554342 x.hi = hy;
555343 x.lo = ty;
556344 END_FIX
557 return x;
558 }
559
560
561 quad_float sqrt(const quad_float& y) {
562 if (y.hi < 0.0)
563 ArithmeticError("quad_float: square root of negative number");
564 if (y.hi == 0.0) return quad_float(0.0,0.0);
565
566 double c;
567 c = sqrt(y.hi);
568 ForceToMem(&c); // This is fairly paranoid, but it doesn't cost too much.
569
570 START_FIX
571
345 }
346
347
348 NO_INLINE void quad_float_in_place_sqrt(quad_float& y, double& c_ref) {
349 START_FIX
350 DOUBLE c = c_ref;
572351 DOUBLE p,q,hx,tx,u,uu,cc;
573352 DOUBLE t1;
574353
596375 hx = c+cc;
597376 tx = c-hx;
598377 tx = tx+cc;
599 END_FIX
600 return quad_float(hx, tx);
601 }
602
603
604
605 void power(quad_float& z, const quad_float& a, long e)
378
379 y.hi = hx;
380 y.lo = tx;
381 END_FIX
382 }
383
384
385 NO_INLINE void quad_float_PrecisionOK(long& res, const double& one)
606386 {
607 quad_float res, u;
608 unsigned long k;
609
610 if (e < 0)
611 k = -((unsigned long) e);
612 else
613 k = e;
614
615 res = 1.0;
616 u = a;
617
618 while (k) {
619 if (k & 1)
620 res = res * u;
621
622 k = k >> 1;
623 if (k)
624 u = u * u;
625 }
626
627 if (e < 0)
628 z = 1.0/res;
629 else
630 z = res;
631 }
632
633
634 void power2(quad_float& z, long e)
635 {
636 z.hi = _ntl_ldexp(1.0, e);
637 z.lo = 0;
638 }
639
640
641 long to_long(const quad_float& x)
642 {
643 double fhi, flo;
644
645 fhi = floor(x.hi);
646
647 if (fhi == x.hi)
648 flo = floor(x.lo);
649 else
650 flo = 0;
651
652 // the following code helps to prevent unnecessary integer overflow,
653 // and guarantees that to_long(to_quad_float(a)) == a, for all long a,
654 // provided long's are not too wide.
655
656 if (fhi > 0)
657 return long(flo) - long(-fhi);
658 else
659 return long(fhi) + long(flo);
660 }
661
662
663
664 // This version of ZZ to quad_float conversion relies on the
665 // precise rounding rules implemented by the ZZ to double conversion.
666
667
668 void conv(quad_float& z, const ZZ& a)
669 {
670 double xhi, xlo;
671
672 conv(xhi, a);
673
674 if (!IsFinite(&xhi)) {
675 z.hi = xhi;
676 z.lo = 0;
677 return;
678 }
679
680 NTL_ZZRegister(t);
681
682 conv(t, xhi);
683 sub(t, a, t);
684
685 conv(xlo, t);
686
687 normalize(z, xhi, xlo);
688
689 // The following is just paranoia.
690 if (fabs(z.hi) < NTL_FDOUBLE_PRECISION && z.lo != 0)
691 LogicError("internal error: ZZ to quad_float conversion");
692 }
693
694 void conv(ZZ& z, const quad_float& x)
695 {
696 NTL_ZZRegister(t1);
697 NTL_ZZRegister(t2);
698 NTL_ZZRegister(t3);
699
700 double fhi, flo;
701
702 fhi = floor(x.hi);
703
704 if (fhi == x.hi) {
705 flo = floor(x.lo);
706
707 conv(t1, fhi);
708 conv(t2, flo);
709
710 add(z, t1, t2);
711 }
712 else
713 conv(z, fhi);
714 }
715
716
717
718 ostream& operator<<(ostream& s, const quad_float& a)
719 {
720 quad_float aa = a;
721
722 if (!IsFinite(&aa)) {
723 s << "NaN";
724 return s;
725 }
726
727 RRPush push;
728 RROutputPush opush;
729
730 RR::SetPrecision(long(3.33*quad_float::oprec) + 10);
731 RR::SetOutputPrecision(quad_float::oprec);
732
733 NTL_TLS_LOCAL(RR, t);
734
735 conv(t, a);
736 s << t;
737
738 return s;
739 }
740
741 istream& operator>>(istream& s, quad_float& x)
742 {
743 RRPush push;
744 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
745
746 NTL_TLS_LOCAL(RR, t);
747 NTL_INPUT_CHECK_RET(s, s >> t);
748 conv(x, t);
749
750 return s;
751 }
752
753 void random(quad_float& x)
754 {
755 RRPush push;
756 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
757
758 NTL_TLS_LOCAL(RR, t);
759 random(t);
760 conv(x, t);
761 }
762
763 quad_float random_quad_float()
764 {
765 quad_float x;
766 random(x);
767 return x;
768 }
769
770 long IsFinite(quad_float *x)
771 {
772 return IsFinite(&x->hi) && IsFinite(&x->lo);
773 }
774
775
776 long PrecisionOK()
777 {
778387 START_FIX
779388 long k;
780 DOUBLE l1 = (double)1;
781 DOUBLE lh = 1/(double)2;
389 DOUBLE l1 = one;
390 DOUBLE lh = one/double(2);
782391 DOUBLE epsilon;
783392 DOUBLE fudge, oldfudge;
784393
794403 fudge = l1 + epsilon;
795404 } while (fudge > l1 && fudge < oldfudge);
796405
797 END_FIX
798 return k == NTL_DOUBLE_PRECISION;
799 }
800
801 quad_float floor(const quad_float& x)
802 {
803 double fhi = floor(x.hi);
804
805 if (fhi != x.hi)
806 return quad_float(fhi, 0.0);
807 else {
808 double flo = floor(x.lo);
809 quad_float z;
810 normalize(z, fhi, flo);
811 return z;
812 }
813 }
814
815
816 quad_float ceil(const quad_float& x) {
817 return -floor(-x);
818 }
819
820 quad_float trunc(const quad_float& x) {
821 if (x>=0.0) return floor(x); else return -floor(-x);
822 }
823
824
825
826 long compare(const quad_float& x, const quad_float& y)
827 {
828 if (x.hi > y.hi)
829 return 1;
830 else if (x.hi < y.hi)
831 return -1;
832 else if (x.lo > y.lo)
833 return 1;
834 else if (x.lo < y.lo)
835 return -1;
836 else
837 return 0;
838 }
839
840
841 quad_float fabs(const quad_float& x)
842 { if (x.hi>=0.0) return x; else return -x; }
843
844
845 quad_float ldexp(const quad_float& x, long exp) { // x*2^exp
846 double xhi, xlo;
847 quad_float z;
848
849 xhi = _ntl_ldexp(x.hi, exp);
850 xlo = _ntl_ldexp(x.lo, exp);
851
852 normalize(z, xhi, xlo);
853 return z;
854 }
855
856
857 quad_float exp(const quad_float& x) { // New version 97 Aug 05
858 /*
859 ! Calculate a quadruple-precision exponential
860 ! Method:
861 ! x x.log2(e) nint[x.log2(e)] + frac[x.log2(e)]
862 ! e = 2 = 2
863 !
864 ! iy fy
865 ! = 2 . 2
866 ! Then
867 ! fy y.loge(2)
868 ! 2 = e
869 !
870 ! Now y.loge(2) will be less than 0.3466 in absolute value.
871 ! This is halved and a Pade approximation is used to approximate e^x over
872 ! the region (-0.1733, +0.1733). This approximation is then squared.
873 */
874 if (x.hi<DBL_MIN_10_EXP*2.302585092994045684017991)
875 return to_quad_float(0.0);
876 if (x.hi>DBL_MAX_10_EXP*2.302585092994045684017991) {
877 ResourceError("exp(quad_float): overflow");
878 }
879
880 static const quad_float Log2 = to_quad_float("0.6931471805599453094172321214581765680755");
881 // GLOBAL (assumes C++11 thread-safe init)
882
883 quad_float y,temp,ysq,sum1,sum2;
884 long iy;
885 y=x/Log2;
886 temp = floor(y+0.5);
887 iy = to_long(temp);
888 y=(y-temp)*Log2;
889 y=ldexp(y,-1L);
890 ysq=y*y;
891 sum1=y*((((ysq+3960.0)*ysq+2162160.0)*ysq+302702400.0)*ysq+8821612800.0);
892 sum2=(((90.0*ysq+110880.0)*ysq+30270240.0)*ysq+2075673600.0)*ysq+17643225600.0;
893 /*
894 ! sum2 + sum1 2.sum1
895 ! Now approximation = ----------- = 1 + ----------- = 1 + 2.temp
896 ! sum2 - sum1 sum2 - sum1
897 !
898 ! Then (1 + 2.temp)^2 = 4.temp.(1 + temp) + 1
899 */
900 temp=sum1/(sum2-sum1);
901 y=temp*(temp+1);
902 y=ldexp(y,2L);
903 return ldexp(y+1,iy);
904 }
905
906 quad_float log(const quad_float& t) { // Newton method. See Bailey, MPFUN
907 if (t.hi <= 0.0) {
908 ArithmeticError("log(quad_float): argument must be positive");
909 }
910 double s1 = log(t.hi);
911 ForceToMem(&s1); // Again, this is fairly paranoid.
912 quad_float s;
913 s = s1;
914 quad_float e;
915 e=exp(s);
916 return s+(t-e)/e; // Newton step
917 }
918
919 long operator> (const quad_float& x, const quad_float& y) {
920 return (x.hi> y.hi) || (x.hi==y.hi && x.lo> y.lo); }
921 long operator>=(const quad_float& x, const quad_float& y) {
922 return (x.hi>y.hi) || (x.hi==y.hi && x.lo>=y.lo); }
923 long operator< (const quad_float& x, const quad_float& y) {
924 return (x.hi< y.hi) || (x.hi==y.hi && x.lo< y.lo); }
925 long operator<=(const quad_float& x, const quad_float& y) {
926 return (x.hi<y.hi) || (x.hi==y.hi && x.lo<=y.lo); }
927 long operator==(const quad_float& x, const quad_float& y)
928 { return x.hi==y.hi && x.lo==y.lo; }
929 long operator!=(const quad_float& x, const quad_float& y)
930 { return x.hi!=y.hi || x.lo!=y.lo; }
406 res = (k == NTL_DOUBLE_PRECISION);
407 END_FIX
408 }
409
410
931411
932412
933413 NTL_END_IMPL
0 #include <NTL/quad_float.h>
1 #include <NTL/RR.h>
2
3 #include <cfloat>
4
5 NTL_START_IMPL
6
7
8 #if (NTL_BITS_PER_LONG >= NTL_DOUBLE_PRECISION)
9
10
11 quad_float to_quad_float(long n)
12 {
13 double xhi, xlo;
14
15 xhi = TrueDouble(n);
16
17 // Because we are assuming 2's complement integer
18 // arithmetic, the following prevents long(xhi) from overflowing.
19
20 if (n > 0)
21 xlo = TrueDouble(n+long(-xhi));
22 else
23 xlo = TrueDouble(n-long(xhi));
24
25 // renormalize...just to be safe
26
27 quad_float z;
28 quad_float_normalize(z, xhi, xlo);
29 return z;
30 }
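// A concrete check of the overflow guard (assuming 64-bit two's complement
// longs): for n = 2^63 - 1, TrueDouble(n) rounds up to xhi = 2^63, which is
// just outside the range of long, so long(xhi) would overflow; the code
// instead forms n + long(-xhi) = -1, giving the exact pair (xhi, xlo) =
// (2^63, -1).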
31
32 quad_float to_quad_float(unsigned long n)
33 {
34 double xhi, xlo, t;
35
36 const double bnd = double(1L << (NTL_BITS_PER_LONG-2))*4.0;
37
38 xhi = TrueDouble(n);
39
40 if (xhi >= bnd)
41 t = xhi - bnd;
42 else
43 t = xhi;
44
45 // we use the "to_long" function here to be as portable as possible.
46 long llo = to_long(n - (unsigned long)(t));
47 xlo = TrueDouble(llo);
48
49 quad_float z;
50 quad_float_normalize(z, xhi, xlo);
51 return z;
52 }
53 #endif
54
55
56 NTL_CHEAP_THREAD_LOCAL
57 long quad_float::oprec = 10;
58
59 void quad_float::SetOutputPrecision(long p)
60 {
61 if (p < 1) p = 1;
62
63 if (NTL_OVERFLOW(p, 1, 0))
64 ResourceError("quad_float: output precision too big");
65
66 oprec = p;
67 }
68
69
70
71 void power(quad_float& z, const quad_float& a, long e)
72 {
73 quad_float res, u;
74 unsigned long k;
75
76 if (e < 0)
77 k = -((unsigned long) e);
78 else
79 k = e;
80
81 res = 1.0;
82 u = a;
83
84 while (k) {
85 if (k & 1)
86 res = res * u;
87
88 k = k >> 1;
89 if (k)
90 u = u * u;
91 }
92
93 if (e < 0)
94 z = 1.0/res;
95 else
96 z = res;
97 }
98
99
100 void power2(quad_float& z, long e)
101 {
102 z.hi = _ntl_ldexp(1.0, e);
103 z.lo = 0;
104 }
105
106
107 long to_long(const quad_float& x)
108 {
109 double fhi, flo;
110
111 fhi = floor(x.hi);
112
113 if (fhi == x.hi)
114 flo = floor(x.lo);
115 else
116 flo = 0;
117
118 // the following code helps to prevent unnecessary integer overflow,
119 // and guarantees that to_long(to_quad_float(a)) == a, for all long a,
120 // provided long's are not too wide.
121
122 if (fhi > 0)
123 return long(flo) - long(-fhi);
124 else
125 return long(fhi) + long(flo);
126 }
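// Worked example (64-bit longs), continuing the to_quad_float case above:
// for x = (2^63, -1), floor gives fhi = 2^63 and flo = -1; since fhi > 0 the
// result is long(flo) - long(-fhi) = -1 - (-2^63) = 2^63 - 1, recovering
// LONG_MAX exactly without ever converting fhi itself to a long.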
127
128
129
130 // This version of ZZ to quad_float conversion relies on the
131 // precise rounding rules implemented by the ZZ to double conversion.
132
133
134 void conv(quad_float& z, const ZZ& a)
135 {
136 double xhi, xlo;
137
138 conv(xhi, a);
139
140 if (!IsFinite(&xhi)) {
141 z.hi = xhi;
142 z.lo = 0;
143 return;
144 }
145
146 NTL_ZZRegister(t);
147
148 conv(t, xhi);
149 sub(t, a, t);
150
151 conv(xlo, t);
152
153 quad_float_normalize(z, xhi, xlo);
154 }
155
156 void conv(ZZ& z, const quad_float& x)
157 {
158 NTL_ZZRegister(t1);
159 NTL_ZZRegister(t2);
160 NTL_ZZRegister(t3);
161
162 double fhi, flo;
163
164 fhi = floor(x.hi);
165
166 if (fhi == x.hi) {
167 flo = floor(x.lo);
168
169 conv(t1, fhi);
170 conv(t2, flo);
171
172 add(z, t1, t2);
173 }
174 else
175 conv(z, fhi);
176 }
177
178
179
180 ostream& operator<<(ostream& s, const quad_float& a)
181 {
182 quad_float aa = a;
183
184 if (!IsFinite(&aa)) {
185 s << "NaN";
186 return s;
187 }
188
189 RRPush push;
190 RROutputPush opush;
191
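   // 3.33 is slightly more than log2(10) ~= 3.32, so this allots enough RR
   // precision (in bits) to print oprec decimal digits, with some slack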
192 RR::SetPrecision(long(3.33*quad_float::oprec) + 10);
193 RR::SetOutputPrecision(quad_float::oprec);
194
195 NTL_TLS_LOCAL(RR, t);
196
197 conv(t, a);
198 s << t;
199
200 return s;
201 }
202
203 istream& operator>>(istream& s, quad_float& x)
204 {
205 RRPush push;
206 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
207
208 NTL_TLS_LOCAL(RR, t);
209 NTL_INPUT_CHECK_RET(s, s >> t);
210 conv(x, t);
211
212 return s;
213 }
214
215 void random(quad_float& x)
216 {
217 RRPush push;
218 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
219
220 NTL_TLS_LOCAL(RR, t);
221 random(t);
222 conv(x, t);
223 }
224
225 quad_float random_quad_float()
226 {
227 quad_float x;
228 random(x);
229 return x;
230 }
231
232 long IsFinite(quad_float *x)
233 {
234 return IsFinite(&x->hi) && IsFinite(&x->lo);
235 }
236
237
238 quad_float floor(const quad_float& x)
239 {
240 double fhi = floor(x.hi);
241
242 if (fhi != x.hi)
243 return quad_float(fhi, 0.0);
244 else {
245 double flo = floor(x.lo);
246 quad_float z;
247 quad_float_normalize(z, fhi, flo);
248 return z;
249 }
250 }
251
252
253 quad_float ceil(const quad_float& x) {
254 return -floor(-x);
255 }
256
257 quad_float trunc(const quad_float& x) {
258 if (x>=0.0) return floor(x); else return -floor(-x);
259 }
260
261
262
263 long compare(const quad_float& x, const quad_float& y)
264 {
265 if (x.hi > y.hi)
266 return 1;
267 else if (x.hi < y.hi)
268 return -1;
269 else if (x.lo > y.lo)
270 return 1;
271 else if (x.lo < y.lo)
272 return -1;
273 else
274 return 0;
275 }
276
277
278 quad_float fabs(const quad_float& x)
279 { if (x.hi>=0.0) return x; else return -x; }
280
281
282 quad_float ldexp(const quad_float& x, long exp) { // x*2^exp
283 double xhi, xlo;
284 quad_float z;
285
286 xhi = _ntl_ldexp(x.hi, exp);
287 xlo = _ntl_ldexp(x.lo, exp);
288
289 quad_float_normalize(z, xhi, xlo);
290 return z;
291 }
292
293
294 quad_float exp(const quad_float& x) { // New version 97 Aug 05
295 /*
296 ! Calculate a quadruple-precision exponential
297 ! Method:
298 ! x x.log2(e) nint[x.log2(e)] + frac[x.log2(e)]
299 ! e = 2 = 2
300 !
301 ! iy fy
302 ! = 2 . 2
303 ! Then
304 ! fy y.loge(2)
305 ! 2 = e
306 !
307 ! Now y.loge(2) will be less than 0.3466 in absolute value.
308 ! This is halved and a Pade approximation is used to approximate e^x over
309 ! the region (-0.1733, +0.1733). This approximation is then squared.
310 */
311 if (x.hi<DBL_MIN_10_EXP*2.302585092994045684017991)
312 return to_quad_float(0.0);
313 if (x.hi>DBL_MAX_10_EXP*2.302585092994045684017991) {
314 ResourceError("exp(quad_float): overflow");
315 }
316
317 static const quad_float Log2 = to_quad_float("0.6931471805599453094172321214581765680755");
318 // GLOBAL (assumes C++11 thread-safe init)
319
320 quad_float y,temp,ysq,sum1,sum2;
321 long iy;
322 y=x/Log2;
323 temp = floor(y+0.5);
324 iy = to_long(temp);
325 y=(y-temp)*Log2;
326 y=ldexp(y,-1L);
327 ysq=y*y;
328 sum1=y*((((ysq+3960.0)*ysq+2162160.0)*ysq+302702400.0)*ysq+8821612800.0);
329 sum2=(((90.0*ysq+110880.0)*ysq+30270240.0)*ysq+2075673600.0)*ysq+17643225600.0;
330 /*
331 ! sum2 + sum1 2.sum1
332 ! Now approximation = ----------- = 1 + ----------- = 1 + 2.temp
333 ! sum2 - sum1 sum2 - sum1
334 !
335 ! Then (1 + 2.temp)^2 = 4.temp.(1 + temp) + 1
336 */
337 temp=sum1/(sum2-sum1);
338 y=temp*(temp+1);
339 y=ldexp(y,2L);
340 return ldexp(y+1,iy);
341 }
342
343 quad_float log(const quad_float& t) { // Newton method. See Bailey, MPFUN
344 if (t.hi <= 0.0) {
345 ArithmeticError("log(quad_float): argument must be positive");
346 }
347
348 quad_float s = to_quad_float(log(t.hi));
349 // NOTE: in case log yields excess precision, this assumes
350 // that to_quad_float removes it
351
352 quad_float e = exp(s);
353 return s+(t-e)/e; // Newton step
354 }
355
356 quad_float sqrt(const quad_float& y)
357 {
358 if (y.hi < 0.0)
359 ArithmeticError("quad_float: square root of negative number");
360 if (y.hi == 0.0) return quad_float(0.0,0.0);
361
362 double c = TrueDouble(sqrt(y.hi));
363 // NOTE: we call TrueDouble, just in case sqrt yields excess precision
364
365 quad_float yy = y;
366 quad_float_in_place_sqrt(yy, c);
367 return yy;
368 }
369
370
371 long operator> (const quad_float& x, const quad_float& y) {
372 return (x.hi> y.hi) || (x.hi==y.hi && x.lo> y.lo); }
373 long operator>=(const quad_float& x, const quad_float& y) {
374 return (x.hi>y.hi) || (x.hi==y.hi && x.lo>=y.lo); }
375 long operator< (const quad_float& x, const quad_float& y) {
376 return (x.hi< y.hi) || (x.hi==y.hi && x.lo< y.lo); }
377 long operator<=(const quad_float& x, const quad_float& y) {
378 return (x.hi<y.hi) || (x.hi==y.hi && x.lo<=y.lo); }
379 long operator==(const quad_float& x, const quad_float& y)
380 { return x.hi==y.hi && x.lo==y.lo; }
381 long operator!=(const quad_float& x, const quad_float& y)
382 { return x.hi!=y.hi || x.lo!=y.lo; }
383
384
385 NTL_END_IMPL