Codebase list ntl / 55c369b
Import ntl_9.9.0.orig.tar.gz Julien Puydt 7 years ago
238 changed file(s) with 18542 addition(s) and 3786 deletion(s). Raw diff Collapse all Expand all
0 NTL -- a library for doing numbery theory -- version 9.3.0
1 Release date: 2015.7.9
0 NTL -- a library for doing number theory -- version 9.9.0
1 Release date: 2016.05.30
22
33 Author: Victor Shoup (victor@shoup.net)
44
0 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1 <html>
2 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/BasicThreadPool.cpp.html</title>
4 <meta name="Generator" content="Vim/7.1">
5 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
6 </head>
7 <body bgcolor="#ffffff" text="#000000"><font face="monospace">
8 <br>
9 <br>
10 <font color="#0000ed"><i>/*</i></font><font color="#0000ed"><i>***********************************************************************</i></font><br>
11 <br>
12 <font color="#0000ed"><i>MODULE: BasicThreadPool</i></font><br>
13 <br>
14 <font color="#0000ed"><i>SUMMARY:</i></font><br>
15 <br>
16 <font color="#0000ed"><i>A simple thread pool class BasicThreadPool, as well as some higher-level macros</i></font><br>
17 <font color="#0000ed"><i>which facilitate simple parallel for loops.</i></font><br>
18 <br>
19 <br>
20 <font color="#0000ed"><i>**************************************************************************</i></font><font color="#0000ed"><i>*/</i></font><br>
21 <br>
22 <br>
23 <font color="#0000ed"><i>// ********************** Simple parallel for loops **************************</i></font><br>
24 <font color="#0000ed"><i>// </i></font><br>
25 <font color="#0000ed"><i>// We begin with a description of the higher-level macros for writing simple</i></font><br>
26 <font color="#0000ed"><i>// parallel for loops.&nbsp;&nbsp;These facilities are activated only when NTL is</i></font><br>
27 <font color="#0000ed"><i>// configured with NTL_THREAD_BOOST=on (which implies NTL_THREADS=on).</i></font><br>
28 <font color="#0000ed"><i>// However, code that uses these facilities should still compile and run</i></font><br>
29 <font color="#0000ed"><i>// correctly even when NTL_THREAD_BOOST=off, or even when NTL_THREADS=off, so</i></font><br>
30 <font color="#0000ed"><i>// this is the simplest way to write parallel for loops across a range of</i></font><br>
31 <font color="#0000ed"><i>// compile-time and run-time environments.&nbsp;&nbsp;Note that if NTL_THREADS=on, C++11</i></font><br>
32 <font color="#0000ed"><i>// features are required, but when NTL_THREADS=off, these features are not</i></font><br>
33 <font color="#0000ed"><i>// required, so the code should compile on older C++ compilers.</i></font><br>
34 <font color="#0000ed"><i>// </i></font><br>
35 <font color="#0000ed"><i>// Here is a simple recipe for writing a parallel for loop.</i></font><br>
36 <font color="#0000ed"><i>// </i></font><br>
37 <font color="#0000ed"><i>// At the start of program execution, your program should execute</i></font><br>
38 <br>
39 &nbsp;&nbsp; SetNumThreads(nt);<br>
40 <br>
41 <font color="#0000ed"><i>// You can choose nt to be any positive integer, but for best results, it</i></font><br>
42 <font color="#0000ed"><i>// should correspond to the number of available cores on your machine.</i></font><br>
43 <font color="#0000ed"><i>// [NOTE: if NTL_THREAD_BOOST=off, this function is still defined, but does</i></font><br>
44 <font color="#0000ed"><i>// nothing.]</i></font><br>
45 <font color="#0000ed"><i>// </i></font><br>
46 <font color="#0000ed"><i>// Now consider the following routine:</i></font><br>
47 <br>
48 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;mul(ZZ *x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ *a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ *b, <font color="#008b00"><b>long</b></font>&nbsp;n) <br>
49 &nbsp;&nbsp; {<br>
50 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#b02f60"><b>for</b></font>&nbsp;(<font color="#008b00"><b>long</b></font>&nbsp;i = <font color="#ff8b00">0</font>; i &lt; n; i++)<br>
51 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; mul(x[i], a[i], b[i]);<br>
52 &nbsp;&nbsp; }<br>
53 <br>
54 <font color="#0000ed"><i>// We can parallelize it as follows:</i></font><br>
55 <br>
56 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;mul(ZZ *x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ *a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ *b, <font color="#008b00"><b>long</b></font>&nbsp;n) <br>
57 &nbsp;&nbsp; {<br>
58 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NTL_EXEC_RANGE(n, first, last) <br>
59 <br>
60 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#b02f60"><b>for</b></font>&nbsp;(<font color="#008b00"><b>long</b></font>&nbsp;i = first; i &lt; last; i++)<br>
61 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;mul(x[i], a[i], b[i]);<br>
62 <br>
63 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NTL_EXEC_RANGE_END<br>
64 &nbsp;&nbsp; }<br>
65 <br>
66 <font color="#0000ed"><i>// NTL_EXEC_RANGE and NTL_EXEC_RANGE_END are macros that just &quot;do the right</i></font><br>
67 <font color="#0000ed"><i>// thing&quot;.&nbsp;&nbsp;If there are nt threads available, the interval [0..n) will be</i></font><br>
68 <font color="#0000ed"><i>// partitioned into (up to)&nbsp;&nbsp;nt subintervals, and a different thread will be</i></font><br>
69 <font color="#0000ed"><i>// used to process each subinterval. You still have to write the for loop</i></font><br>
70 <font color="#0000ed"><i>// yourself: the macro just declares and initializes variables &quot;first&quot; and</i></font><br>
71 <font color="#0000ed"><i>// &quot;last&quot; (or whatever you want to call them) of type long that represent the</i></font><br>
72 <font color="#0000ed"><i>// subinterval [first..last) to be processed by one thread.</i></font><br>
73 <font color="#0000ed"><i>// </i></font><br>
74 <font color="#0000ed"><i>// Note that the current thread participates as one of the nt available</i></font><br>
75 <font color="#0000ed"><i>// threads, and that the current thread will wait for all participating threads</i></font><br>
76 <font color="#0000ed"><i>// to finish their task before proceeding.</i></font><br>
77 <font color="#0000ed"><i>// </i></font><br>
78 <font color="#0000ed"><i>// Within the &quot;body&quot; of this construct, you can freely reference any variables</i></font><br>
79 <font color="#0000ed"><i>// that are visible at this point.&nbsp;&nbsp;This is implemented using the C++ lambda</i></font><br>
80 <font color="#0000ed"><i>// feature (capturing all variables by reference).</i></font><br>
81 <font color="#0000ed"><i>// </i></font><br>
82 <font color="#0000ed"><i>// This construct will still work even if threads are disabled, in which case</i></font><br>
83 <font color="#0000ed"><i>// it runs single-threaded with first=0 and last=n.</i></font><br>
84 <font color="#0000ed"><i>// </i></font><br>
85 <font color="#0000ed"><i>// Note that the code within the EXEC_RANGE body could call other routines that</i></font><br>
86 <font color="#0000ed"><i>// themselves attempt to execute an EXEC_RANGE: if this happens, the latter</i></font><br>
87 <font color="#0000ed"><i>// EXEC_RANGE will detect this and run single-threaded.</i></font><br>
88 <font color="#0000ed"><i>// </i></font><br>
89 <font color="#0000ed"><i>// You may wish to do other things within the EXEC_RANGE body than just execute</i></font><br>
90 <font color="#0000ed"><i>// a loop.&nbsp;&nbsp;One thing you may want to do is to declare variables.&nbsp;&nbsp;Another</i></font><br>
91 <font color="#0000ed"><i>// thing you may want to do is setup a local context for a ZZ_p modulus (or</i></font><br>
92 <font color="#0000ed"><i>// other type of modulus).&nbsp;&nbsp;Here is an example of doing this:</i></font><br>
93 <br>
94 <br>
95 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;mul(ZZ_p *x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ_p *a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ_p *b, <font color="#008b00"><b>long</b></font>&nbsp;n) <br>
96 &nbsp;&nbsp; {<br>
97 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;ZZ_pContext context;<br>
98 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;context.save();<br>
99 <br>
100 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NTL_EXEC_RANGE(n, first, last) <br>
101 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <br>
102 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; context.restore();<br>
103 <br>
104 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#b02f60"><b>for</b></font>&nbsp;(<font color="#008b00"><b>long</b></font>&nbsp;i = first; i &lt; last; i++)<br>
105 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;mul(x[i], a[i], b[i]);<br>
106 <br>
107 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NTL_EXEC_RANGE_END<br>
108 &nbsp;&nbsp; }<br>
109 <br>
110 <br>
111 <font color="#0000ed"><i>// Another useful function is AvailableThreads(), which will return the number</i></font><br>
112 <font color="#0000ed"><i>// of available threads.&nbsp;&nbsp;If threads or thread boosting is not enabled, this</i></font><br>
113 <font color="#0000ed"><i>// will return 1.&nbsp;&nbsp;Even if thread boosting is enabled, this may return 1 if for</i></font><br>
114 <font color="#0000ed"><i>// whatever reason, the thread pool is not available for use (for example,</i></font><br>
115 <font color="#0000ed"><i>// SetNumThreads was never called, or the thread pool is already active).</i></font><br>
116 <font color="#0000ed"><i>// </i></font><br>
117 <font color="#0000ed"><i>// A lower-level set of tools is available, which allow you to simply run a</i></font><br>
118 <font color="#0000ed"><i>// specified number of threads.&nbsp;&nbsp;Assuming nt &lt;= AvailableThreads(), the code</i></font><br>
119 <br>
120 &nbsp;&nbsp; NTL_EXEC_INDEX(nt, index)<br>
121 <br>
122 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;... code ...<br>
123 <br>
124 &nbsp;&nbsp; NTL_EXEC_INDEX_END<br>
125 <br>
126 <font color="#0000ed"><i>// will execute the body on nt different threads, each with a unique index in</i></font><br>
127 <font color="#0000ed"><i>// the range [0..nt).&nbsp;&nbsp;A variable named &quot;index&quot; (or whatever name you specify)</i></font><br>
128 <font color="#0000ed"><i>// of type long will hold the given index.</i></font><br>
129 <font color="#0000ed"><i>// </i></font><br>
130 <font color="#0000ed"><i>// This tool is useful if you need to manage memory a bit more carefully.&nbsp;&nbsp;For</i></font><br>
131 <font color="#0000ed"><i>// example, the following code will compute an inner product using all</i></font><br>
132 <font color="#0000ed"><i>// available threads:</i></font><br>
133 <br>
134 &nbsp;&nbsp; ZZ InnerProd(<font color="#008b00"><b>const</b></font>&nbsp;ZZ *a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ *b, <font color="#008b00"><b>long</b></font>&nbsp;n) <br>
135 &nbsp;&nbsp; {<br>
136 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;PartitionInfo pinfo(n);<br>
137 <br>
138 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;cnt = pinfo.NumIntervals();<br>
139 <br>
140 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Vec&lt;ZZ&gt; acc;<br>
141 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;acc.SetLength(cnt);<br>
142 <br>
143 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NTL_EXEC_INDEX(cnt, index)<br>
144 <br>
145 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#008b00"><b>long</b></font>&nbsp;first, last;<br>
146 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; pinfo.interval(first, last, index);<br>
147 <br>
148 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ZZ&amp; sum = acc[index];<br>
149 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; sum = <font color="#ff8b00">0</font>;<br>
150 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#b02f60"><b>for</b></font>&nbsp;(<font color="#008b00"><b>long</b></font>&nbsp;i = first; i &lt; last; i++) <br>
151 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;MulAddTo(sum, a[i], b[i]);<br>
152 <br>
153 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NTL_EXEC_INDEX_END<br>
154 <br>
155 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;ZZ sum;<br>
156 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;sum = <font color="#ff8b00">0</font>;<br>
157 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#b02f60"><b>for</b></font>&nbsp;(<font color="#008b00"><b>long</b></font>&nbsp;i = <font color="#ff8b00">0</font>; i &lt; cnt; i++)<br>
158 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; sum += acc[i];<br>
159 <br>
160 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#b02f60"><b>return</b></font>&nbsp;sum;<br>
161 &nbsp;&nbsp; }<br>
162 <br>
163 <font color="#0000ed"><i>// This example also illustrates the class PartitionInfo, which is useful for</i></font><br>
164 <font color="#0000ed"><i>// partitioning a large interval into smaller intervals (it is used internally</i></font><br>
165 <font color="#0000ed"><i>// by EXEC_RANGE).&nbsp;&nbsp;The constructor takes a single argument (in this example n)</i></font><br>
166 <font color="#0000ed"><i>// and computes a partition of [0..n) into nearly equally sized subintervals.</i></font><br>
167 <font color="#0000ed"><i>// The method NumIntervals() returns the number of subintervals, and the method</i></font><br>
168 <font color="#0000ed"><i>// interval(first, last, index) sets first and last according to the endpoints</i></font><br>
169 <font color="#0000ed"><i>// of the subinterval [first..last) with the given index.</i></font><br>
170 <font color="#0000ed"><i>// </i></font><br>
171 <font color="#0000ed"><i>// So in this example, cnt threads will run, each accumulating a sum into a</i></font><br>
172 <font color="#0000ed"><i>// corresponding element of the vector acc, and afterwards, these elements are</i></font><br>
173 <font color="#0000ed"><i>// summed.</i></font><br>
174 <font color="#0000ed"><i>// </i></font><br>
175 <font color="#0000ed"><i>// Note that if threads are not enabled or otherwise unavailable, the above</i></font><br>
176 <font color="#0000ed"><i>// code will compile and run correctly (just using one thread).</i></font><br>
177 <font color="#0000ed"><i>// </i></font><br>
178 <font color="#0000ed"><i>// Finally, there is a &quot;guarded&quot; version of NTL_EXEC_RANGE called</i></font><br>
179 <font color="#0000ed"><i>// NTL_GEXEC_RANGE.&nbsp;&nbsp;This allows one to dynamically &quot;guard&quot; against parallel</i></font><br>
180 <font color="#0000ed"><i>// execution. For example, on very small problems the runtime overhead of a</i></font><br>
181 <font color="#0000ed"><i>// parallel for loop may not be worthwhile, or in other situations parallel</i></font><br>
182 <font color="#0000ed"><i>// execution could cause incorrect behavior.&nbsp;&nbsp;See below for details.</i></font><br>
183 <br>
184 <br>
185 <font color="#0000ed"><i>// ************************** Thread Pools ******************************</i></font><br>
186 <font color="#0000ed"><i>// </i></font><br>
187 <font color="#0000ed"><i>// The above facilities are built on top of a more general thread pool class,</i></font><br>
188 <font color="#0000ed"><i>// which you may use for your own purposes.</i></font><br>
189 <font color="#0000ed"><i>//&nbsp;&nbsp;&nbsp;&nbsp;</i></font><br>
190 <font color="#0000ed"><i>// You create a thread pool by constructing a BasicThreadPool object.&nbsp;&nbsp;For</i></font><br>
191 <font color="#0000ed"><i>// example:</i></font><br>
192 <br>
193 &nbsp;&nbsp; <font color="#008b00"><b>long</b></font>&nbsp;nthreads = <font color="#ff8b00">4</font>;<br>
194 &nbsp;&nbsp; BasicThreadPool pool(nthreads);<br>
195 <br>
196 <font color="#0000ed"><i>// creates a thread pool of 4 threads.&nbsp;&nbsp;These threads will exist until the</i></font><br>
197 <font color="#0000ed"><i>// destructor for pool is called.&nbsp;&nbsp;</i></font><br>
198 <font color="#0000ed"><i>// </i></font><br>
199 <font color="#0000ed"><i>// The simplest way to use a thread pool is as follows.&nbsp;&nbsp;Suppose you have a</i></font><br>
200 <font color="#0000ed"><i>// task that consists of sz subtasks, indexed 0..sz-1.&nbsp;&nbsp;Then you can write:</i></font><br>
201 <br>
202 &nbsp;&nbsp; pool.exec_range(sz, <br>
203 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&amp;](<font color="#008b00"><b>long</b></font>&nbsp;first, <font color="#008b00"><b>long</b></font>&nbsp;last) {<br>
204 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#b02f60"><b>for</b></font>&nbsp;(<font color="#008b00"><b>long</b></font>&nbsp;i = first; i &lt; last; i++) {<br>
205 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;... code to process subtask i ...<br>
206 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; }<br>
207 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}<br>
208 &nbsp;&nbsp; );<br>
209 <br>
210 <font color="#0000ed"><i>// The second argument to exec_range is a C++11 &quot;lambda&quot;.&nbsp;&nbsp;The &quot;[&amp;]&quot; indicates</i></font><br>
211 <font color="#0000ed"><i>// that all local variables in the calling context are captured by reference,</i></font><br>
212 <font color="#0000ed"><i>// so the lambda body can reference all visible local variables directly.</i></font><br>
213 <font color="#0000ed"><i>// C++11 provides other methods for capturing local variables.&nbsp;&nbsp;The interval</i></font><br>
214 <font color="#0000ed"><i>// [0..sz) is partitioned into subintervals of the form [first..last), which</i></font><br>
215 <font color="#0000ed"><i>// are processed by the code in the supplied lambda.</i></font><br>
216 <font color="#0000ed"><i>// </i></font><br>
217 <font color="#0000ed"><i>// A lower-level interface is also provided.&nbsp;&nbsp;One can write:</i></font><br>
218 <br>
219 &nbsp;&nbsp; pool.exec_index(cnt,<br>
220 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&amp;](<font color="#008b00"><b>long</b></font>&nbsp;index) {<br>
221 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ... code to process the given index ...<br>
222 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}<br>
223 &nbsp;&nbsp; );<br>
224 <br>
225 <font color="#0000ed"><i>// This will activate exactly cnt threads with indices 0..cnt-1, and execute</i></font><br>
226 <font color="#0000ed"><i>// the given code on each index.&nbsp;&nbsp;The parameter cnt must not exceed nthreads,</i></font><br>
227 <font color="#0000ed"><i>// otherwise an error is raised.</i></font><br>
228 <br>
229 <br>
230 <font color="#0000ed"><i>// ====================================================================</i></font><br>
231 <font color="#0000ed"><i>// </i></font><br>
232 <font color="#0000ed"><i>// NOTES:</i></font><br>
233 <font color="#0000ed"><i>// </i></font><br>
234 <font color="#0000ed"><i>// When one activates a thread pool with nthreads threads, the *current* thread</i></font><br>
235 <font color="#0000ed"><i>// (the one activating the pool) will also participate in the computation.</i></font><br>
236 <font color="#0000ed"><i>// This means that the thread pool only contains nthreads-1 other threads.</i></font><br>
237 <font color="#0000ed"><i>// </i></font><br>
238 <font color="#0000ed"><i>// If, during an activation, any thread throws an exception, it will be caught</i></font><br>
239 <font color="#0000ed"><i>// and rethrown in the activating thread when all the threads complete.&nbsp;&nbsp;If</i></font><br>
240 <font color="#0000ed"><i>// more than one thread throws an exception, the first one that is caught is</i></font><br>
241 <font color="#0000ed"><i>// the one that is rethrown.</i></font><br>
242 <font color="#0000ed"><i>// </i></font><br>
243 <font color="#0000ed"><i>// Methods are also provided for adding, deleting, and moving threads in and</i></font><br>
244 <font color="#0000ed"><i>// among thread pools.</i></font><br>
245 <font color="#0000ed"><i>// </i></font><br>
246 <font color="#0000ed"><i>// If NTL_THREADS=off, the corresponding header file may be included, but the</i></font><br>
247 <font color="#0000ed"><i>// BasicThreadPool class is not defined.</i></font><br>
248 <font color="#0000ed"><i>//</i></font><br>
249 <font color="#0000ed"><i>// Unlike most classes in NTL, the BasicThreadPool is not relocatable and hence</i></font><br>
250 <font color="#0000ed"><i>// cannot be used in a Vec.&nbsp;&nbsp;One should first wrap it in a pointer class, such</i></font><br>
251 <font color="#0000ed"><i>// as UniquePtr.</i></font><br>
252 <br>
253 <br>
254 <br>
255 <font color="#0000ed"><i>// class BasicThreadPool: provides basic functionality for thread pools</i></font><br>
256 <br>
257 <font color="#008b00"><b>class</b></font>&nbsp;BasicThreadPool {<br>
258 <font color="#b02f60"><b>private</b></font>:<br>
259 <br>
260 &nbsp;&nbsp;BasicThreadPool(<font color="#008b00"><b>const</b></font>&nbsp;BasicThreadPool&amp;); <font color="#0000ed"><i>// disabled</i></font><br>
261 &nbsp;&nbsp;<font color="#008b00"><b>void</b></font>&nbsp;<font color="#b02f60"><b>operator</b></font>=(<font color="#008b00"><b>const</b></font>&nbsp;BasicThreadPool&amp;); <font color="#0000ed"><i>// disabled</i></font><br>
262 <br>
263 <font color="#b02f60"><b>public</b></font>:<br>
264 <br>
265 &nbsp;&nbsp;<font color="#008b00"><b>explicit</b></font><br>
266 &nbsp;&nbsp;BasicThreadPool(<font color="#008b00"><b>long</b></font>&nbsp;nthreads);<br>
267 &nbsp;&nbsp;<font color="#0000ed"><i>// creates a pool with nthreads threads, including the current thread</i></font><br>
268 &nbsp;&nbsp;<font color="#0000ed"><i>// (so nthreads-1 other threads get created)</i></font><br>
269 <br>
270 &nbsp;&nbsp;<font color="#008b00"><b>template</b></font>&lt;<font color="#008b00"><b>class</b></font>&nbsp;Fct&gt;<br>
271 &nbsp;&nbsp;<font color="#008b00"><b>void</b></font>&nbsp;exec_range(<font color="#008b00"><b>long</b></font>&nbsp;sz, <font color="#008b00"><b>const</b></font>&nbsp;Fct&amp; fct); <br>
272 &nbsp;&nbsp;<font color="#0000ed"><i>// activate by range (see example usage above)</i></font><br>
273 <br>
274 &nbsp;&nbsp;<font color="#008b00"><b>template</b></font>&lt;<font color="#008b00"><b>class</b></font>&nbsp;Fct&gt;<br>
275 &nbsp;&nbsp;<font color="#008b00"><b>void</b></font>&nbsp;exec_index(<font color="#008b00"><b>long</b></font>&nbsp;cnt, <font color="#008b00"><b>const</b></font>&nbsp;Fct&amp; fct); <br>
276 &nbsp;&nbsp;<font color="#0000ed"><i>// activate by index (see example usage above)</i></font><br>
277 <br>
278 &nbsp;&nbsp;<font color="#008b00"><b>void</b></font>&nbsp;add(<font color="#008b00"><b>long</b></font>&nbsp;n = <font color="#ff8b00">1</font>);<br>
279 &nbsp;&nbsp;<font color="#0000ed"><i>// add n threads to the pool</i></font><br>
280 <br>
281 &nbsp;&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;NumThreads() <font color="#008b00"><b>const</b></font>;<br>
282 &nbsp;&nbsp;<font color="#0000ed"><i>// return number of threads (including current thread)</i></font><br>
283 <br>
284 &nbsp;&nbsp;<font color="#008b00"><b>void</b></font>&nbsp;remove(<font color="#008b00"><b>long</b></font>&nbsp;n = <font color="#ff8b00">1</font>);<br>
285 &nbsp;&nbsp;<font color="#0000ed"><i>// remove n threads from the pool</i></font><br>
286 &nbsp;&nbsp;<br>
287 &nbsp;&nbsp;<font color="#008b00"><b>void</b></font>&nbsp;move(BasicThreadPool&amp; other, <font color="#008b00"><b>long</b></font>&nbsp;n = <font color="#ff8b00">1</font>);<br>
288 &nbsp;&nbsp;<font color="#0000ed"><i>// move n threads from other pool to this pool</i></font><br>
289 <br>
290 &nbsp;&nbsp;<font color="#008b00"><b>bool</b></font>&nbsp;active() <font color="#008b00"><b>const</b></font>;<br>
291 &nbsp;&nbsp;<font color="#0000ed"><i>// indicates an activation is in process: invoking any of the methods</i></font><br>
292 &nbsp;&nbsp;<font color="#0000ed"><i>// exec_index, exec_range, add, remove, move, or the destructor</i></font><br>
293 &nbsp;&nbsp;<font color="#0000ed"><i>// while active will raise an error</i></font><br>
294 <br>
295 &nbsp;&nbsp;<font color="#008b00"><b>template</b></font>&lt;<font color="#008b00"><b>class</b></font>&nbsp;Fct&gt;<br>
296 &nbsp;&nbsp;<font color="#008b00"><b>static</b></font>&nbsp;<font color="#008b00"><b>void</b></font>&nbsp;relaxed_exec_range(BasicThreadPool *pool, <font color="#008b00"><b>long</b></font>&nbsp;sz, <font color="#008b00"><b>const</b></font>&nbsp;Fct&amp; fct);<br>
297 &nbsp;&nbsp;<font color="#0000ed"><i>// similar to pool-&gt;exec_range(sz, fct), but will still work even </i></font><br>
298 &nbsp;&nbsp;<font color="#0000ed"><i>// if !pool or pool-&gt;active(), using just the current thread</i></font><br>
299 <br>
300 &nbsp;&nbsp;<font color="#008b00"><b>template</b></font>&lt;<font color="#008b00"><b>class</b></font>&nbsp;Fct&gt;<br>
301 &nbsp;&nbsp;<font color="#008b00"><b>static</b></font>&nbsp;<font color="#008b00"><b>void</b></font>&nbsp;relaxed_exec_index(BasicThreadPool *pool, <font color="#008b00"><b>long</b></font>&nbsp;cnt, <font color="#008b00"><b>const</b></font>&nbsp;Fct&amp; fct);<br>
302 &nbsp;&nbsp;<font color="#0000ed"><i>// similar to pool-&gt;exec_index(cnt, fct), but will still work even </i></font><br>
303 &nbsp;&nbsp;<font color="#0000ed"><i>// if !pool or pool-&gt;active(), provided cnt &lt;= 1, using just the current thread</i></font><br>
304 <br>
305 };<br>
306 <br>
307 <br>
308 <br>
309 <br>
310 <font color="#0000ed"><i>// THREAD BOOSTING FEATURES:</i></font><br>
311 <br>
312 <font color="#008b00"><b>void</b></font>&nbsp;SetNumThreads(<font color="#008b00"><b>long</b></font>&nbsp;nt);<br>
313 <font color="#0000ed"><i>// convenience routine to set NTL's thread pool.</i></font><br>
314 <font color="#0000ed"><i>// If called more than once, the old thread pool is destroyed and</i></font><br>
315 <font color="#0000ed"><i>// replaced by a new one.</i></font><br>
316 <font color="#0000ed"><i>// If NTL_THREAD_BOOST=off, then this is still defined, but does nothing.</i></font><br>
317 <br>
318 <font color="#008b00"><b>long</b></font>&nbsp;AvailableThreads();<br>
319 <font color="#0000ed"><i>// Number of threads currently available to use in NTL's thread pool.&nbsp;&nbsp;This is</i></font><br>
320 <font color="#0000ed"><i>// always at least 1 (for the current thread).&nbsp;&nbsp;</i></font><br>
321 <font color="#0000ed"><i>// If NTL_THREAD_BOOST=off, then this is still defined, and always returns 1.</i></font><br>
322 <br>
323 BasicThreadPool *GetThreadPool();<br>
324 <font color="#008b00"><b>void</b></font>&nbsp;ResetThreadPool(BasicThreadPool *pool = <font color="#ff8b00">0</font>);<br>
325 BasicThreadPool *ReleaseThreadPool();<br>
326 <font color="#0000ed"><i>// Routines to get and set NTL's thread pool.&nbsp;&nbsp;The interfaces parallel NTL's</i></font><br>
327 <font color="#0000ed"><i>// UniquePtr class, and indeed, behind the scenes, NTL's thread pool is stored</i></font><br>
328 <font color="#0000ed"><i>// as a UniquePtr&lt;BasicThreadPool&gt;.</i></font><br>
329 <font color="#0000ed"><i>// These are only declared when NTL_THREAD_BOOST=on.&nbsp;&nbsp;</i></font><br>
330 <br>
331 <br>
332 <font color="#1773cc">#define NTL_EXEC_RANGE(sz, first, last) ...</font><br>
333 <font color="#1773cc">#define NTL_EXEC_RANGE_END ...</font><br>
334 <font color="#1773cc">#define NTL_EXEC_INDEX(cnt, index) ...</font><br>
335 <font color="#1773cc">#define NTL_EXEC_INDEX_END ...</font><br>
336 <font color="#0000ed"><i>// convenience macros to implement &quot;parallel for loops&quot; using NTL's thread</i></font><br>
337 <font color="#0000ed"><i>// pool.&nbsp;&nbsp;See examples above for usage.&nbsp;&nbsp;If NTL_THREAD_BOOST=off, then these</i></font><br>
338 <font color="#0000ed"><i>// are still defined, and code will run on a single thread</i></font><br>
339 <br>
340 <br>
341 <font color="#1773cc">#define NTL_GEXEC_RANGE(seq, sz, first, last) ...</font><br>
342 <font color="#1773cc">#define NTL_GEXEC_RANGE_END ...</font><br>
343 <font color="#0000ed"><i>// &quot;guarded&quot; version of NTL_EXEC_RANGE: if seq evaluates to true, the code runs</i></font><br>
344 <font color="#0000ed"><i>// on a single thread.&nbsp;&nbsp;This is useful in avoiding situations where the</i></font><br>
345 <font color="#0000ed"><i>// overhead of a parallel loop is too high.&nbsp;&nbsp;If seq evaluates to the constant</i></font><br>
346 <font color="#0000ed"><i>// true, a good compiler will optimize code to run on a single thread, with no</i></font><br>
347 <font color="#0000ed"><i>// overhead.</i></font><br>
348 <br>
349 <font color="#1773cc">#define NTL_IMPORT(x) </font><br>
350 <font color="#0000ed"><i>// To be used in conjunction with NTL_EXEC_RANGE and friends.&nbsp;&nbsp;When</i></font><br>
351 <font color="#0000ed"><i>// NTL_THREAD_BOOST=on, this will copy the variable named x from the enclosing</i></font><br>
352 <font color="#0000ed"><i>// scope to a local copy.&nbsp;&nbsp;This should only be used for types with cheap</i></font><br>
353 <font color="#0000ed"><i>// copies, such as scalars and pointers.&nbsp;&nbsp;In some situations, this allows the</i></font><br>
354 <font color="#0000ed"><i>// compiler to optimize a bit more aggressively.&nbsp;&nbsp;One or more of these may be</i></font><br>
355 <font color="#0000ed"><i>// placed right after an NTL_EXEC_RANGE.</i></font><br>
356 <font color="#0000ed"><i>// When NTL_THREAD_BOOST=off, this is still defined, and does nothing.</i></font><br>
357 <br>
358 <br>
359 <font color="#0000ed"><i>// class PartitionInfo: A helper class to facilitate partitioning an interval</i></font><br>
360 <font color="#0000ed"><i>// into subintervals.&nbsp;&nbsp;NOTE: this class is available, even when</i></font><br>
361 <font color="#0000ed"><i>// NTL_THREAD_BOOST=off.</i></font><br>
362 <br>
363 <font color="#008b00"><b>class</b></font>&nbsp;PartitionInfo {<br>
364 <font color="#b02f60"><b>public</b></font>:<br>
365 <br>
366 &nbsp;&nbsp; <font color="#008b00"><b>explicit</b></font><br>
367 &nbsp;&nbsp; PartitionInfo(<font color="#008b00"><b>long</b></font>&nbsp;sz, <font color="#008b00"><b>long</b></font>&nbsp;nt = AvailableThreads()); <br>
368 &nbsp;&nbsp; <font color="#0000ed"><i>// partitions [0..sz) into at most nt subintervals.&nbsp;&nbsp;sz may be 0 or</i></font><br>
369 &nbsp;&nbsp; <font color="#0000ed"><i>// negative, in which case the number of subintervals is 0.</i></font><br>
370 <br>
371 &nbsp;&nbsp; <font color="#008b00"><b>long</b></font>&nbsp;NumIntervals() <font color="#008b00"><b>const</b></font>;<br>
372 &nbsp;&nbsp; <font color="#0000ed"><i>// return the number of subintervals</i></font><br>
373 <br>
374 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;interval(<font color="#008b00"><b>long</b></font>&amp; first, <font color="#008b00"><b>long</b></font>&amp; last, <font color="#008b00"><b>long</b></font>&nbsp;i) <font color="#008b00"><b>const</b></font>;<br>
375 &nbsp;&nbsp; <font color="#0000ed"><i>// [first..last) is the ith interval, where i in [0..NumIntervals()).&nbsp;&nbsp;No</i></font><br>
376 &nbsp;&nbsp; <font color="#0000ed"><i>// range checking is performed.</i></font><br>
377 <br>
378 };<br>
379 <br>
380 <br>
381 <br>
382 </font></body>
383 </html>
0
1
2 /************************************************************************
3
4 MODULE: BasicThreadPool
5
6 SUMMARY:
7
8 A simple thread pool class BasicThreadPool, as well as some higher-level macros
9 which facilitate simple parallel for loops.
10
11
12 ***************************************************************************/
13
14
15 // ********************** Simple parallel for loops **************************
16 //
17 // We begin with a description of the higher-level macros for writing simple
18 // parallel for loops. These facilities are activated only when NTL is
19 // configured with NTL_THREAD_BOOST=on (which implies NTL_THREADS=on).
20 // However, code that uses these facilities should still compile and run
21 // correctly even when NTL_THREAD_BOOST=off, or even when NTL_THREADS=off, so
22 // this is the simplest way to write parallel for loops across a range of
23 // compile-time and run-time environments. Note that if NTL_THREADS=on, C++11
24 // features are required, but when NTL_THREADS=off, these features are not
25 // required, so the code should compile on older C++ compilers.
26 //
27 // Here is a simple recipe for writing a parallel for loop.
28 //
29 // At the start of program execution, your program should execute
30
31 SetNumThreads(nt);
32
33 // You can choose nt to be any positive integer, but for best results, it
34 // should correspond to the number of available cores on your machine.
35 // [NOTE: if NTL_THREAD_BOOST=off, this function is still defined, but does
36 // nothing.]
37 //
38 // Now consider the following routine:
39
40 void mul(ZZ *x, const ZZ *a, const ZZ *b, long n)
41 {
42 for (long i = 0; i < n; i++)
43 mul(x[i], a[i], b[i]);
44 }
45
46 // We can parallelize it as follows:
47
48 void mul(ZZ *x, const ZZ *a, const ZZ *b, long n)
49 {
50 NTL_EXEC_RANGE(n, first, last)
51
52 for (long i = first; i < last; i++)
53 mul(x[i], a[i], b[i]);
54
55 NTL_EXEC_RANGE_END
56 }
57
58 // NTL_EXEC_RANGE and NTL_EXEC_RANGE_END are macros that just "do the right
59 // thing". If there are nt threads available, the interval [0..n) will be
60 // partitioned into (up to) nt subintervals, and a different thread will be
61 // used to process each subinterval. You still have to write the for loop
62 // yourself: the macro just declares and initializes variables "first" and
63 // "last" (or whatever you want to call them) of type long that represent the
64 // subinterval [first..last) to be processed by one thread.
65 //
66 // Note that the current thread participates as one of the nt available
67 // threads, and that the current thread will wait for all participating threads
68 // to finish their task before proceeding.
69 //
70 // Within the "body" of this construct, you can freely reference any variables
71 // that are visible at this point. This is implemented using the C++ lambda
72 // feature (capturing all variables by reference).
73 //
74 // This construct will still work even if threads are disabled, in which case
75 // it runs single-threaded with first=0 and last=n.
76 //
77 // Note that the code within the EXEC_RANGE body could call other routines that
78 // themselves attempt to execute an EXEC_RANGE: if this happens, the latter
79 // EXEC_RANGE will detect this and run single-threaded.
80 //
81 // You may wish to do other things within the EXEC_RANGE body than just execute
82 // a loop. One thing you may want to do is to declare variables. Another
83 // thing you may want to do is setup a local context for a ZZ_p modulus (or
84 // other type of modulus). Here is an example of doing this:
85
86
87 void mul(ZZ_p *x, const ZZ_p *a, const ZZ_p *b, long n)
88 {
89 ZZ_pContext context;
90 context.save();
91
92 NTL_EXEC_RANGE(n, first, last)
93
94 context.restore();
95
96 for (long i = first; i < last; i++)
97 mul(x[i], a[i], b[i]);
98
99 NTL_EXEC_RANGE_END
100 }
101
102
103 // Another useful function is AvailableThreads(), which will return the number
104 // of available threads. If threads or thread boosting is not enabled, this
105 // will return 1. Even if thread boosting is enabled, this may return 1 if for
106 // whatever reason, the thread pool is not available for use (for example,
107 // SetNumThreads was never called, or the thread pool is already active).
108 //
109 // A lower-level set of tools is available, which allow you to simply run a
110 // specified number of threads. Assuming nt <= AvailableThreads(), the code
111
112 NTL_EXEC_INDEX(nt, index)
113
114 ... code ...
115
116 NTL_EXEC_INDEX_END
117
118 // will execute the body on nt different threads, each with a unique index in
119 // the range [0..nt). A variable named "index" (or whatever name you specify)
120 // of type long will hold the given index.
121 //
122 // This tool is useful if you need to manage memory a bit more carefully. For
123 // example, the following code will compute an inner product using all
124 // available threads:
125
126 ZZ InnerProd(const ZZ *a, const ZZ *b, long n)
127 {
128 PartitionInfo pinfo(n);
129
130 long cnt = pinfo.NumIntervals();
131
132 Vec<ZZ> acc;
133 acc.SetLength(cnt);
134
135 NTL_EXEC_INDEX(cnt, index)
136
137 long first, last;
138 pinfo.interval(first, last, index);
139
140 ZZ& sum = acc[index];
141 sum = 0;
142 for (long i = first; i < last; i++)
143 MulAddTo(sum, a[i], b[i]);
144
145 NTL_EXEC_INDEX_END
146
147 ZZ sum;
148 sum = 0;
149 for (long i = 0; i < cnt; i++)
150 sum += acc[i];
151
152 return sum;
153 }
154
155 // This example also illustrates the class PartitionInfo, which is useful for
156 // partitioning a large interval into smaller intervals (it is used internally
157 // by EXEC_RANGE). The constructor takes a single argument (in this example n)
158 // and computes a partition of [0..n) into nearly equally sized subintervals.
159 // The method NumIntervals() returns the number of subintervals, and the method
160 // interval(first, last, index) sets first and last according to the endpoints
161 // of the subinterval [first..last) with the given index.
162 //
163 // So in this example, cnt threads will run, each accumulating a sum into a
164 // corresponding element of the vector acc, and afterwards, these elements are
165 // summed.
166 //
167 // Note that if threads are not enabled or otherwise unavailable, the above
168 // code will compile and run correctly (just using one thread).
169 //
170 // Finally, there is a "guarded" version of NTL_EXEC_RANGE called
171 // NTL_GEXEC_RANGE. This allows one to dynamically "guard" against parallel
172 // execution. For example, on very small problems the runtime overhead of a
173 // parallel for loop may not be worthwhile, or in other situations parallel
174 // execution could cause incorrect behavior. See below for details.
175
176
177 // ************************** Thread Pools ******************************
178 //
179 // The above facilities are built on top of a more general thread pool class,
180 // which you may use for your own purposes.
181 //
182 // You create a thread pool by constructing a BasicThreadPool object. For
183 // example:
184
185 long nthreads = 4;
186 BasicThreadPool pool(nthreads);
187
188 // creates a thread pool of 4 threads. These threads will exist until the
189 // destructor for pool is called.
190 //
191 // The simplest way to use a thread pool is as follows. Suppose you have a
192 // task that consists of sz subtasks, indexed 0..sz-1. Then you can write:
193
194 pool.exec_range(sz,
195 [&](long first, long last) {
196 for (long i = first; i < last; i++) {
197 ... code to process subtask i ...
198 }
199 }
200 );
201
202 // The second argument to exec_range is a C++11 "lambda". The "[&]" indicates
203 // that all local variables in the calling context are captured by reference,
204 // so the lambda body can reference all visible local variables directly.
205 // C++11 provides other methods for capturing local variables. The interval
206 // [0..sz) is partitioned into subintervals of the form [first..last), which
207 // are processed by the code in the supplied lambda.
208 //
209 // A lower-level interface is also provided. One can write:
210
211 pool.exec_index(cnt,
212 [&](long index) {
213 ... code to process index i ...
214 }
215 );
216
217 // This will activate exactly cnt threads with indices 0..cnt-1, and execute
218 // the given code on each index. The parameter cnt must not exceed nthreads,
219 // otherwise an error is raised.
220
221
222 // ====================================================================
223 //
224 // NOTES:
225 //
226 // When one activates a thread pool with nthreads threads, the *current* thread
227 // (the one activating the pool) will also participate in the computation.
228 // This means that the thread pool only contains nthreads-1 other threads.
229 //
230 // If, during an activation, any thread throws an exception, it will be caught
231 // and rethrown in the activating thread when all the threads complete. If
232 // more than one thread throws an exception, the first one that is caught is
233 // the one that is rethrown.
234 //
235 // Methods are also provided for adding, deleting, and moving threads in and
236 // among thread pools.
237 //
238 // If NTL_THREADS=off, the corresponding header file may be included, but the
239 // BasicThreadPool class is not defined.
240 //
241 // Unlike most classes in NTL, the BasicThreadPool is not relocatable and hence
242 // cannot be used in a Vec. One should first wrap it in a pointer class, such
243 // as UniquePtr.
244
245
246
247 // class BasicThreadPool: provides basic functionality for thread pools
248
249 class BasicThreadPool {
250 private:
251
252 BasicThreadPool(const BasicThreadPool&); // disabled
253 void operator=(const BasicThreadPool&); // disabled
254
255 public:
256
257 explicit
258 BasicThreadPool(long nthreads);
259 // creates a pool with nthreads threads, including the current thread
260 // (so nthreads-1 other threads get created)
261
262 template<class Fct>
263 void exec_range(long sz, const Fct& fct);
264 // activate by range (see example usage above)
265
266 template<class Fct>
267 void exec_index(long cnt, const Fct& fct);
268 // activate by index (see example usage above)
269
270 void add(long n = 1);
271 // add n threads to the pool
272
273 long NumThreads() const;
274 // return number of threads (including current thread)
275
276 void remove(long n = 1);
277 // remove n threads from the pool
278
279 void move(BasicThreadPool& other, long n = 1);
280 // move n threads from other pool to this pool
281
282 bool active() const;
283 // indicates an activation is in process: invoking any of the methods
284 // exec_index, exec_range, add, remove, move, or the destructor
285 // while active will raise an error
286
287 template<class Fct>
288 static void relaxed_exec_range(BasicThreadPool *pool, long sz, const Fct& fct);
289 // similar to pool->exec_range(sz, fct), but will still work even
290 // if !pool or pool->active(), using just the current thread
291
292 template<class Fct>
293 static void relaxed_exec_index(BasicThreadPool *pool, long cnt, const Fct& fct);
294 // similar to pool->exec_index(cnt, fct), but will still work even
295 // if !pool or pool->active(), provided cnt <= 1, using just the current thread
296
297 };
298
299
300
301
302 // THREAD BOOSTING FEATURES:
303
304 void SetNumThreads(long nt);
305 // convenience routine to set NTL's thread pool.
306 // If called more than once, the old thread pool is destroyed and
307 // replaced by a new one.
308 // If NTL_THREAD_BOOST=off, then this is still defined, but does nothing.
309
310 long AvailableThreads();
311 // Number of threads currently available to use in NTL's thread pool. This is
312 // always at least 1 (for the current thread).
313 // If NTL_THREAD_BOOST=off, then this is still defined, and always returns 1.
314
315 BasicThreadPool *GetThreadPool();
316 void ResetThreadPool(BasicThreadPool *pool = 0);
317 BasicThreadPool *ReleaseThreadPool();
318 // Routines to get and set NTL's thread pool. The interfaces parallel NTL's
319 // UniquePtr class, and indeed, behind the scenes, NTL's thread pool is stored
320 // as a UniquePtr<BasicThreadPool>.
321 // These are only declared when NTL_THREAD_BOOST=on.
322
323
324 #define NTL_EXEC_RANGE(sz, first, last) ...
325 #define NTL_EXEC_RANGE_END ...
326 #define NTL_EXEC_INDEX(cnt, index) ...
327 #define NTL_EXEC_INDEX_END ...
328 // convenience macros to implement "parallel for loops" using NTL's thread
329 // pool. See examples above for usage. If NTL_THREAD_BOOST=off, then these
330 // are still defined, and code will run on a single thread
331
332
333 #define NTL_GEXEC_RANGE(seq, sz, first, last) ...
334 #define NTL_GEXEC_RANGE_END ...
335 // "guarded" version of NTL_EXEC_RANGE: if seq evaluates to true, the code runs
336 // on a single thread. This is useful in avoiding situations where the
337 // overhead of a parallel loop is too high. If seq evaluates to the constant
338 // true, a good compiler will optimize code to run on a single thread, with no
339 // overhead.
340
341 #define NTL_IMPORT(x)
342 // To be used in conjunction with NTL_EXEC_RANGE and friends. When
343 // NTL_THREAD_BOOST=on, this will copy the variable named x from the enclosing
344 // scope to a local copy. This should only be used for types with cheap
345 // copies, such as scalars and pointers. In some situations, this allows the
346 // compiler to optimize a bit more aggressively. One or more of these may be
347 // placed right after an NTL_EXEC_RANGE.
348 // When NTL_THREAD_BOOST=off, this is still defined, and does nothing.
349
350
351 // class PartitionInfo: A helper class to facilitate partitioning an interval
352 // into subintervals. NOTE: this class is available, even when
353 // NTL_THREAD_BOOST=off.
354
355 class PartitionInfo {
356 public:
357
358 explicit
359 PartitionInfo(long sz, long nt = AvailableThreads());
360 // partitions [0..sz) into at most nt subintervals. sz may be 0 or
361 // negative, in which case the number of subintervals is 0.
362
363 long NumIntervals() const;
364 // return the number of subintervals
365
366 void interval(long& first, long& last, long i) const;
367 // [first..last) is the ith interval, where i in [0..NumIntervals()). No
368 // range checking is performed.
369
370 };
371
372
373
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2E.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2E.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2EX.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2EX.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2EXFactoring.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2EXFactoring.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
8585 <font color="#0000ed"><i>// this routine uses external files to store some intermediate</i></font><br>
8686 <font color="#0000ed"><i>// results, which are removed if the routine terminates normally.</i></font><br>
8787 <font color="#0000ed"><i>// These files are stored in the current directory under names of the</i></font><br>
88 <font color="#0000ed"><i>// form ddf-*-baby-* and ddf-*-giant-*.</i></font><br>
88 <font color="#0000ed"><i>// form tmp-*.</i></font><br>
8989 <font color="#0000ed"><i>// The definition of &quot;large&quot; is controlled by the variable</i></font><br>
9090 <br>
9191 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#008b00"><b>extern</b></font>&nbsp;<font color="#008b00"><b>double</b></font>&nbsp;GF2EXFileThresh<br>
7777 // this routine uses external files to store some intermediate
7878 // results, which are removed if the routine terminates normally.
7979 // These files are stored in the current directory under names of the
80 // form ddf-*-baby-* and ddf-*-giant-*.
80 // form tmp-*.
8181 // The definition of "large" is controlled by the variable
8282
8383 extern double GF2EXFileThresh
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2X.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2X.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2XFactoring.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2XFactoring.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2XVec.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2XVec.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/HNF.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/HNF.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/LLL.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/LLL.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/Lazy.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/Lazy.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/LazyTable.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/LazyTable.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/RR.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/RR.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
2525 <br>
2626 <font color="#0000ed"><i>The minimum precision that can be set is 53 bits.</i></font><br>
2727 <font color="#0000ed"><i>The maximum precision is limited only by the word size of the machine.</i></font><br>
28 <br>
29 <font color="#0000ed"><i>A convenience class RRPush is provided to automatically save and</i></font><br>
30 <font color="#0000ed"><i>restore the current precision.</i></font><br>
2831 <br>
2932 <font color="#0000ed"><i>All arithmetic operations are implemented so that the effect is as if the</i></font><br>
3033 <font color="#0000ed"><i>result was computed exactly, and then rounded to p bits.&nbsp;&nbsp;If a number</i></font><br>
356359 ZZ RoundToZZ(<font color="#008b00"><b>const</b></font>&nbsp;RR&amp; a);&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#0000ed"><i>// ties are rounded to an even integer</i></font><br>
357360 <br>
358361 <br>
362 <br>
363 <a name="push"></a>
364 <br>
365 <font color="#0000ed"><i>/*</i></font><font color="#0000ed"><i>*************************************************************************\</i></font><br>
366 <br>
367 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Saving and restoring the current precision</i></font><br>
368 <br>
369 <font color="#0000ed"><i>\*************************************************************************</i></font><font color="#0000ed"><i>*/</i></font><br>
370 <br>
371 <br>
372 <font color="#008b00"><b>class</b></font>&nbsp;RRPush {<br>
373 <font color="#b02f60"><b>public</b></font>:<br>
374 &nbsp;&nbsp; RRPush();&nbsp;&nbsp;<font color="#0000ed"><i>// saves the current precision</i></font><br>
375 &nbsp;&nbsp; ~RRPush(); <font color="#0000ed"><i>// restores the saved precision</i></font><br>
376 <br>
377 <font color="#b02f60"><b>private</b></font>: <br>
378 &nbsp;&nbsp; RRPush(<font color="#008b00"><b>const</b></font>&nbsp;RRPush&amp;); <font color="#0000ed"><i>// disable</i></font><br>
379 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;<font color="#b02f60"><b>operator</b></font>=(<font color="#008b00"><b>const</b></font>&nbsp;RRPush&amp;); <font color="#0000ed"><i>// disable</i></font><br>
380 };<br>
381 <br>
382 <br>
383 <font color="#0000ed"><i>// Example: </i></font><br>
384 <font color="#0000ed"><i>//</i></font><br>
385 <font color="#0000ed"><i>// {</i></font><br>
386 <font color="#0000ed"><i>//&nbsp;&nbsp;&nbsp;&nbsp;RRPush push;&nbsp;&nbsp;// don't forget to declare a variable!!</i></font><br>
387 <font color="#0000ed"><i>//&nbsp;&nbsp;&nbsp;&nbsp;RR::SetPrecision(new_p);</i></font><br>
388 <font color="#0000ed"><i>//&nbsp;&nbsp;&nbsp;&nbsp;...</i></font><br>
389 <font color="#0000ed"><i>// } // old precision restored when scope is exited</i></font><br>
390 <br>
391 <br>
392 <font color="#008b00"><b>class</b></font>&nbsp;RROutputPush {<br>
393 <font color="#b02f60"><b>public</b></font>:<br>
394 &nbsp;&nbsp; RROutputPush();&nbsp;&nbsp; <font color="#0000ed"><i>// saves the current output precision</i></font><br>
395 &nbsp;&nbsp; ~RROutputPush();&nbsp;&nbsp;<font color="#0000ed"><i>// restores the saved output precision</i></font><br>
396 <br>
397 <font color="#b02f60"><b>private</b></font>: <br>
398 &nbsp;&nbsp; RROutputPush(<font color="#008b00"><b>const</b></font>&nbsp;RROutputPush&amp;); <font color="#0000ed"><i>// disable</i></font><br>
399 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;<font color="#b02f60"><b>operator</b></font>=(<font color="#008b00"><b>const</b></font>&nbsp;RROutputPush&amp;); <font color="#0000ed"><i>// disable</i></font><br>
400 };<br>
401 <br>
402 <br>
403 <font color="#0000ed"><i>// Example: </i></font><br>
404 <font color="#0000ed"><i>//</i></font><br>
405 <font color="#0000ed"><i>// {</i></font><br>
406 <font color="#0000ed"><i>//&nbsp;&nbsp;&nbsp;&nbsp;RROutputPush push;&nbsp;&nbsp;// don't forget to declare a variable!!</i></font><br>
407 <font color="#0000ed"><i>//&nbsp;&nbsp;&nbsp;&nbsp;RR::SetOutputPrecision(new_op);</i></font><br>
408 <font color="#0000ed"><i>//&nbsp;&nbsp;&nbsp;&nbsp;...</i></font><br>
409 <font color="#0000ed"><i>// } // old output precision restored when scope is exited</i></font><br>
410 <br>
411 <br>
412 <br>
413 <br>
359414 <font color="#0000ed"><i>/*</i></font><font color="#0000ed"><i>*************************************************************************\</i></font><br>
360415 <br>
361416 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Miscelaneous</i></font><br>
1717
1818 The minimum precision that can be set is 53 bits.
1919 The maximum precision is limited only by the word size of the machine.
20
21 A convenience class RRPush is provided to automatically save and
22 restore the current precision.
2023
2124 All arithmetic operations are implemented so that the effect is as if the
2225 result was computed exactly, and then rounded to p bits. If a number
348351 ZZ RoundToZZ(const RR& a); // ties are rounded to an even integer
349352
350353
354
355 // @anchor{push}
356
357 /**************************************************************************\
358
359 Saving and restoring the current precision
360
361 \**************************************************************************/
362
363
364 class RRPush {
365 public:
366 RRPush(); // saves the current precision
367 ~RRPush(); // restores the saved precision
368
369 private:
370 RRPush(const RRPush&); // disable
371 void operator=(const RRPush&); // disable
372 };
373
374
375 // Example:
376 //
377 // {
378 // RRPush push; // don't forget to declare a variable!!
379 // RR::SetPrecision(new_p);
380 // ...
381 // } // old precision restored when scope is exited
382
383
384 class RROutputPush {
385 public:
386 RROutputPush(); // saves the current output precision
387 ~RROutputPush(); // restores the saved output precision
388
389 private:
390 RROutputPush(const RROutputPush&); // disable
391 void operator=(const RROutputPush&); // disable
392 };
393
394
395 // Example:
396 //
397 // {
398 // RROutputPush push; // don't forget to declare a variable!!
399 // RR::SetOutputPrecision(new_op);
400 // ...
401 // } // old output precision restored when scope is exited
402
403
404
405
351406 /**************************************************************************\
352407
353408 Miscellaneous
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/SmartPtr.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/SmartPtr.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
405405 &nbsp;&nbsp; <font color="#0000ed"><i>// returns raw pointer, and sets the raw pointer to null</i></font><br>
406406 <br>
407407 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;move(UniquePtr&amp; other);<br>
408 &nbsp;&nbsp; <font color="#008b00"><b>template</b></font>&lt;<font color="#008b00"><b>class</b></font>&nbsp;Y&gt; <font color="#008b00"><b>void</b></font>&nbsp;move(UniquePtr&lt;Y&gt;&amp; other);<br>
408409 &nbsp;&nbsp; <font color="#0000ed"><i>// move other to *this</i></font><br>
410 &nbsp;&nbsp; <font color="#0000ed"><i>// in the second version, Y* should be convertible to T*</i></font><br>
411 &nbsp;&nbsp; <font color="#0000ed"><i>// NOTE: if Y is a subclass of T, then typically, ~T() should</i></font><br>
412 &nbsp;&nbsp; <font color="#0000ed"><i>// be virtual, to ensure that the destructor for Y is called</i></font><br>
409413 <br>
410414 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;swap(UniquePtr&amp; other);<br>
411415 &nbsp;&nbsp; <font color="#0000ed"><i>// swap raw pointers</i></font><br>
463467 <br>
464468 <font color="#0000ed"><i>&nbsp;&nbsp; p1.val()&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; // dereference</i></font><br>
465469 <br>
470 <font color="#0000ed"><i>&nbsp;&nbsp; rp = p1.get();&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; // fetch raw pointer</i></font><br>
471 <font color="#0000ed"><i>&nbsp;&nbsp; rp = p1.release();&nbsp;&nbsp; // fetch raw pointer, and set to NULL</i></font><br>
466472 <font color="#0000ed"><i>&nbsp;&nbsp; p1.move(p2);&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; // move p2 to p1, destroying p1's referent</i></font><br>
467473 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;//&nbsp;&nbsp; if p1 != p2</i></font><br>
468474 <br>
510516 &nbsp;&nbsp; <font color="#008b00"><b>bool</b></font>&nbsp;exists() <font color="#008b00"><b>const</b></font>;<br>
511517 &nbsp;&nbsp; <font color="#0000ed"><i>// checks that underlying pointer is not null</i></font><br>
512518 <br>
519 &nbsp;&nbsp; T* get() <font color="#008b00"><b>const</b></font>;<br>
520 &nbsp;&nbsp; <font color="#0000ed"><i>// returns underlying raw pointer</i></font><br>
521 <br>
522 &nbsp;&nbsp; T* release();<br>
523 &nbsp;&nbsp; <font color="#0000ed"><i>// returns raw pointer, and sets the raw pointer to null</i></font><br>
524 <br>
513525 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;move(OptionalVal&amp; other);<br>
514526 &nbsp;&nbsp; <font color="#0000ed"><i>// performs a (shallow) pointer move</i></font><br>
515527 <br>
595607 <br>
596608 &nbsp;&nbsp; T* get() <font color="#008b00"><b>const</b></font>;<br>
597609 &nbsp;&nbsp; <font color="#0000ed"><i>// get raw pointer</i></font><br>
610 <br>
611 &nbsp;&nbsp; T* elts() <font color="#008b00"><b>const</b></font>;<br>
612 &nbsp;&nbsp; <font color="#0000ed"><i>// get raw pointer (for compatibility with the Vec class)</i></font><br>
598613 <br>
599614 &nbsp;&nbsp; T* release();<br>
600615 &nbsp;&nbsp; <font color="#0000ed"><i>// get raw pointer and reset to null</i></font><br>
397397 // returns raw pointer, and sets the raw pointer to null
398398
399399 void move(UniquePtr& other);
400 template<class Y> void move(UniquePtr<Y>& other);
400401 // move other to *this
402 // in the second version, Y* should be convertible to T*
403 // NOTE: if Y is a subclass of T, then typically, ~T() should
404 // be virtual, to ensure that the destructor for Y is called
401405
402406 void swap(UniquePtr& other);
403407 // swap raw pointers
455459
456460 p1.val() // dereference
457461
462 rp = p1.get(); // fetch raw pointer
463 rp = p1.release(); // fetch raw pointer, and set to NULL
458464 p1.move(p2); // move p2 to p1, destroying p1's referent
459465 // if p1 != p2
460466
502508 bool exists() const;
503509 // checks that underlying pointer is not null
504510
511 T* get() const;
512 // returns underlying raw pointer
513
514 T* release();
515 // returns raw pointer, and sets the raw pointer to null
516
505517 void move(OptionalVal& other);
506518 // performs a (shallow) pointer move
507519
587599
588600 T* get() const;
589601 // get raw pointer
602
603 T* elts() const;
604 // get raw pointer (for compatibility with the Vec class)
590605
591606 T* release();
592607 // get raw pointer and reset to null
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
326326 };<br>
327327 <br>
328328 <font color="#008b00"><b>long</b></font>&nbsp;InvModStatus(ZZ&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; n);<br>
329 <font color="#0000ed"><i>// if gcd(a,b) = 1, then return-value = 0, x = a^{-1} mod n;</i></font><br>
329 <font color="#0000ed"><i>// if gcd(a,n) = 1, then return-value = 0, x = a^{-1} mod n;</i></font><br>
330330 <font color="#0000ed"><i>// otherwise, return-value = 1, x = gcd(a, n)</i></font><br>
331331 <br>
332332 <font color="#008b00"><b>void</b></font>&nbsp;PowerMod(ZZ&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; e, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; n);<br>
400400 <br>
401401 <font color="#008b00"><b>long</b></font>&nbsp;InvMod(<font color="#008b00"><b>long</b></font>&nbsp;a, <font color="#008b00"><b>long</b></font>&nbsp;n);<br>
402402 <font color="#0000ed"><i>// computes a^{-1} mod n.&nbsp;&nbsp;Error is raised if undefined.</i></font><br>
403 <br>
404 <font color="#008b00"><b>long</b></font>&nbsp;InvModStatus(<font color="#008b00"><b>long</b></font>&amp; x, <font color="#008b00"><b>long</b></font>&nbsp;a, <font color="#008b00"><b>long</b></font>&nbsp;n);<br>
405 <font color="#0000ed"><i>// if gcd(a,n) = 1, then return-value = 0, x = a^{-1} mod n;</i></font><br>
406 <font color="#0000ed"><i>// otherwise, return-value = 1, x = gcd(a, n)</i></font><br>
403407 <br>
404408 <font color="#008b00"><b>long</b></font>&nbsp;PowerMod(<font color="#008b00"><b>long</b></font>&nbsp;a, <font color="#008b00"><b>long</b></font>&nbsp;e, <font color="#008b00"><b>long</b></font>&nbsp;n);<br>
405409 <font color="#0000ed"><i>// computes a^e mod n (e may be negative)</i></font><br>
654658 <font color="#0000ed"><i>// NumBytes(0) == 0.</i></font><br>
655659 <br>
656660 <br>
661 <a name="prg"></a>
657662 <br>
658663 <font color="#0000ed"><i>/*</i></font><font color="#0000ed"><i>*************************************************************************\</i></font><br>
659664 <br>
673678 <font color="#0000ed"><i>// seen by a client program.</i></font><br>
674679 <br>
675680 <br>
676 <font color="#008b00"><b>void</b></font>&nbsp;SetSeed(<font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; s); <br>
677 <font color="#0000ed"><i>// Initializes generator with a &quot;seed&quot; s.</i></font><br>
678 <font color="#0000ed"><i>// s is first hashed to generate the initial state, so it is</i></font><br>
679 <font color="#0000ed"><i>// not necessary that s itself looks random, just that </i></font><br>
680 <font color="#0000ed"><i>// it has a lot of &quot;entropy&quot;.</i></font><br>
681 <font color="#0000ed"><i>// If SetSeed is not called before using the routines below,</i></font><br>
682 <font color="#0000ed"><i>// a default initial seed is used.</i></font><br>
683 <font color="#0000ed"><i>// This default seed is guaranteed to be unique among different</i></font><br>
684 <font color="#0000ed"><i>// threads in a given process, and an attempt is made to </i></font><br>
685 <font color="#0000ed"><i>// make this seed globally unique among all threads and processes.</i></font><br>
686 <font color="#0000ed"><i>// Routine ZZFromBytes (above) may be useful for constructing seeds</i></font><br>
687 <font color="#0000ed"><i>// from arbitrary binary data.</i></font><br>
681 <font color="#008b00"><b>void</b></font>&nbsp;SetSeed(<font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; s);<br>
682 <font color="#008b00"><b>void</b></font>&nbsp;SetSeed(<font color="#008b00"><b>const</b></font>&nbsp;<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>char</b></font>&nbsp;*data, <font color="#008b00"><b>long</b></font>&nbsp;dlen);<br>
683 <font color="#008b00"><b>void</b></font>&nbsp;SetSeed(<font color="#008b00"><b>const</b></font>&nbsp;RandomStream&amp; s);<br>
684 <font color="#0000ed"><i>// Initializes generator with a &quot;seed&quot;.</i></font><br>
685 <br>
686 <font color="#0000ed"><i>// The first version hashes the binary representation of s to obtain a key for</i></font><br>
687 <font color="#0000ed"><i>// a low-level RandomStream object (see below).</i></font><br>
688 <br>
689 <font color="#0000ed"><i>// The second version does the same, hashing the first dlen bytes pointed to by</i></font><br>
690 <font color="#0000ed"><i>// data to obtain a key for the RandomStream object.</i></font><br>
691 <br>
692 <font color="#0000ed"><i>// The third version initializes the PRG state directly with the given</i></font><br>
693 <font color="#0000ed"><i>// RandomStream object.</i></font><br>
694 <br>
695 <font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
688696 <br>
689697 <br>
690698 <font color="#008b00"><b>void</b></font>&nbsp;RandomBnd(ZZ&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; n);<br>
691699 ZZ RandomBnd(<font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; n);<br>
700 <font color="#008b00"><b>void</b></font>&nbsp;RandomBnd(<font color="#008b00"><b>long</b></font>&amp; x, <font color="#008b00"><b>long</b></font>&nbsp;n);<br>
692701 <font color="#008b00"><b>long</b></font>&nbsp;RandomBnd(<font color="#008b00"><b>long</b></font>&nbsp;n);<br>
693702 <font color="#0000ed"><i>// x = pseudo-random number in the range 0..n-1, or 0 if n &lt;= 0</i></font><br>
703 <font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
694704 <br>
695705 <font color="#008b00"><b>void</b></font>&nbsp;RandomBits(ZZ&amp; x, <font color="#008b00"><b>long</b></font>&nbsp;l);<br>
696706 ZZ RandomBits_ZZ(<font color="#008b00"><b>long</b></font>&nbsp;l);<br>
707 <font color="#008b00"><b>void</b></font>&nbsp;RandomBits(<font color="#008b00"><b>long</b></font>&amp; x, <font color="#008b00"><b>long</b></font>&nbsp;l);<br>
697708 <font color="#008b00"><b>long</b></font>&nbsp;RandomBits_long(<font color="#008b00"><b>long</b></font>&nbsp;l);<br>
698709 <font color="#0000ed"><i>// x = pseudo-random number in the range 0..2^l-1.</i></font><br>
710 <font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
699711 <br>
700712 <font color="#008b00"><b>void</b></font>&nbsp;RandomLen(ZZ&amp; x, <font color="#008b00"><b>long</b></font>&nbsp;l);<br>
701713 ZZ RandomLen_ZZ(<font color="#008b00"><b>long</b></font>&nbsp;l);<br>
714 <font color="#008b00"><b>void</b></font>&nbsp;RandomLen(<font color="#008b00"><b>long</b></font>&amp; x, <font color="#008b00"><b>long</b></font>&nbsp;l);<br>
702715 <font color="#008b00"><b>long</b></font>&nbsp;RandomLen_long(<font color="#008b00"><b>long</b></font>&nbsp;l);<br>
703716 <font color="#0000ed"><i>// x = pseudo-random number with precisely l bits,</i></font><br>
704717 <font color="#0000ed"><i>// or 0 if l &lt;= 0.</i></font><br>
718 <font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
705719 <br>
706720 <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;RandomBits_ulong(<font color="#008b00"><b>long</b></font>&nbsp;l);<br>
707721 <font color="#0000ed"><i>// returns a pseudo-random number in the range 0..2^l-1</i></font><br>
722 <font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
708723 <br>
709724 <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;RandomWord();<br>
710725 <font color="#0000ed"><i>// returns a word filled with pseudo-random bits.</i></font><br>
711726 <font color="#0000ed"><i>// Equivalent to RandomBits_ulong(NTL_BITS_PER_LONG).</i></font><br>
727 <font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
728 <br>
729 <br>
730 <br>
731 <font color="#008b00"><b>class</b></font>&nbsp;RandomStream { <br>
732 <font color="#0000ed"><i>// The low-level pseudo-random generator (PRG).</i></font><br>
733 <font color="#0000ed"><i>// After initializing it with a key, one can effectively read an unbounded</i></font><br>
734 <font color="#0000ed"><i>// stream of pseudorandom bytes</i></font><br>
735 <br>
736 <font color="#b02f60"><b>public</b></font>:<br>
737 <br>
738 &nbsp;&nbsp; <font color="#008b00"><b>explicit</b></font>&nbsp;RandomStream(<font color="#008b00"><b>const</b></font>&nbsp;<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>char</b></font>&nbsp;*key);<br>
739 &nbsp;&nbsp; <font color="#0000ed"><i>// key should point to an array of NTL_PRG_KEYLEN bytes</i></font><br>
740 &nbsp;&nbsp; <font color="#0000ed"><i>// EXCEPTIONS: nothrow</i></font><br>
741 <br>
742 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;get(<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>char</b></font>&nbsp;*res, <font color="#008b00"><b>long</b></font>&nbsp;n); <br>
743 &nbsp;&nbsp; <font color="#0000ed"><i>// read the next n bytes from the stream and store to location pointed to by</i></font><br>
744 &nbsp;&nbsp; <font color="#0000ed"><i>// res</i></font><br>
745 &nbsp;&nbsp; <font color="#0000ed"><i>// EXCEPTIONS: throws a LogicError exception if n is negative</i></font><br>
746 <br>
747 &nbsp;&nbsp; RandomStream(<font color="#008b00"><b>const</b></font>&nbsp;RandomStream&amp;); <font color="#0000ed"><i>// default</i></font><br>
748 &nbsp;&nbsp; RandomStream&amp; <font color="#b02f60"><b>operator</b></font>=(<font color="#008b00"><b>const</b></font>&nbsp;RandomStream&amp;); <font color="#0000ed"><i>// default</i></font><br>
749 &nbsp;&nbsp; <font color="#0000ed"><i>// EXCEPTIONS: nothrow</i></font><br>
750 };<br>
751 <br>
752 <br>
753 RandomStream&amp; GetCurrentRandomStream();<br>
754 <font color="#0000ed"><i>// get reference to the current PRG state. If SetSeed has not been called, it</i></font><br>
755 <font color="#0000ed"><i>// is called with a default value (which should be unique to each</i></font><br>
756 <font color="#0000ed"><i>// process/thread).&nbsp;&nbsp;NOTE: this is a reference to a thread-local object, so</i></font><br>
757 <font color="#0000ed"><i>// different threads will use different PRG's, and by default, each will be</i></font><br>
758 <font color="#0000ed"><i>// initialized with a unique seed.</i></font><br>
759 <font color="#0000ed"><i>// NOTE: using this reference, you can copy the current PRG state or assign a</i></font><br>
760 <font color="#0000ed"><i>// different value to it; however, see the helper class RandomStreamPush below,</i></font><br>
761 <font color="#0000ed"><i>// which may be more convenient.</i></font><br>
762 <font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
763 <br>
764 <br>
765 <br>
766 <font color="#008b00"><b>class</b></font>&nbsp;RandomStreamPush {<br>
767 <font color="#0000ed"><i>// RAII for saving/restoring current PRG state</i></font><br>
768 <font color="#b02f60"><b>public</b></font>:<br>
769 &nbsp;&nbsp; RandomStreamPush();&nbsp;&nbsp; <font color="#0000ed"><i>// save a copy of the current PRG state</i></font><br>
770 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
771 <br>
772 &nbsp;&nbsp; ~RandomStreamPush();&nbsp;&nbsp;<font color="#0000ed"><i>// restore the saved copy of the PRG state</i></font><br>
773 <br>
774 <font color="#b02f60"><b>private</b></font>: <br>
775 &nbsp;&nbsp; RandomStreamPush(<font color="#008b00"><b>const</b></font>&nbsp;RandomStreamPush&amp;); <font color="#0000ed"><i>// disable</i></font><br>
776 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;<font color="#b02f60"><b>operator</b></font>=(<font color="#008b00"><b>const</b></font>&nbsp;RandomStreamPush&amp;); <font color="#0000ed"><i>// disable</i></font><br>
777 };<br>
778 <br>
779 <br>
780 <font color="#008b00"><b>void</b></font>&nbsp;DeriveKey(<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>char</b></font>&nbsp;*key, <font color="#008b00"><b>long</b></font>&nbsp;klen,&nbsp;&nbsp;<br>
781 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#008b00"><b>const</b></font>&nbsp;<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>char</b></font>&nbsp;*data, <font color="#008b00"><b>long</b></font>&nbsp;dlen);<br>
782 <font color="#0000ed"><i>// utility routine to derive from the byte string (data, dlen) a byte string</i></font><br>
783 <font color="#0000ed"><i>// (key, klen).&nbsp;&nbsp;Heuristically, if (data, dlen) has high entropy, then (key,</i></font><br>
784 <font color="#0000ed"><i>// klen) should be pseudorandom.&nbsp;&nbsp;This routine is also used internally to</i></font><br>
785 <font color="#0000ed"><i>// derive PRG keys.</i></font><br>
786 <font color="#0000ed"><i>// EXCEPTIONS: throws LogicError exception if klen &lt; 0 or dlen &lt; 0</i></font><br>
787 <br>
712788 <br>
713789 <br>
714790 <font color="#0000ed"><i>/*</i></font><font color="#0000ed"><i>*************************************************************************\</i></font><br>
318318 };
319319
320320 long InvModStatus(ZZ& x, const ZZ& a, const ZZ& n);
321 // if gcd(a,b) = 1, then return-value = 0, x = a^{-1} mod n;
321 // if gcd(a,n) = 1, then return-value = 0, x = a^{-1} mod n;
322322 // otherwise, return-value = 1, x = gcd(a, n)
323323
324324 void PowerMod(ZZ& x, const ZZ& a, const ZZ& e, const ZZ& n);
392392
393393 long InvMod(long a, long n);
394394 // computes a^{-1} mod n. Error is raised if undefined.
395
396 long InvModStatus(long& x, long a, long n);
397 // if gcd(a,n) = 1, then return-value = 0, x = a^{-1} mod n;
398 // otherwise, return-value = 1, x = gcd(a, n)
395399
396400 long PowerMod(long a, long e, long n);
397401 // computes a^e mod n (e may be negative)
646650 // NumBytes(0) == 0.
647651
648652
653 // @anchor{prg}
649654
650655 /**************************************************************************\
651656
665670 // seen by a client program.
666671
667672
668 void SetSeed(const ZZ& s);
669 // Initializes generator with a "seed" s.
670 // s is first hashed to generate the initial state, so it is
671 // not necessary that s itself looks random, just that
672 // it has a lot of "entropy".
673 // If SetSeed is not called before using the routines below,
674 // a default initial seed is used.
675 // This default seed is guaranteed to be unique among different
676 // threads in a given process, and an attempt is made to
677 // make this seed globally unique among all threads and processes.
678 // Routine ZZFromBytes (above) may be useful for constructing seeds
679 // from arbitrary binary data.
673 void SetSeed(const ZZ& s);
674 void SetSeed(const unsigned char *data, long dlen);
675 void SetSeed(const RandomStream& s);
676 // Initializes generator with a "seed".
677
678 // The first version hashes the binary representation of s to obtain a key for
679 // a low-level RandomStream object (see below).
680
681 // The second version does the same, hashing the first dlen bytes pointed to by
682 // data to obtain a key for the RandomStream object.
683
684 // The third version initializes the PRG state directly with the given
685 // RandomStream object.
686
687 // EXCEPTIONS: strong ES
680688
681689
682690 void RandomBnd(ZZ& x, const ZZ& n);
683691 ZZ RandomBnd(const ZZ& n);
692 void RandomBnd(long& x, long n);
684693 long RandomBnd(long n);
685694 // x = pseudo-random number in the range 0..n-1, or 0 if n <= 0
695 // EXCEPTIONS: strong ES
686696
687697 void RandomBits(ZZ& x, long l);
688698 ZZ RandomBits_ZZ(long l);
699 void RandomBits(long& x, long l);
689700 long RandomBits_long(long l);
690701 // x = pseudo-random number in the range 0..2^l-1.
702 // EXCEPTIONS: strong ES
691703
692704 void RandomLen(ZZ& x, long l);
693705 ZZ RandomLen_ZZ(long l);
706 void RandomLen(long& x, long l);
694707 long RandomLen_long(long l);
695708 // x = pseudo-random number with precisely l bits,
696709 // or 0 if l <= 0.
710 // EXCEPTIONS: strong ES
697711
698712 unsigned long RandomBits_ulong(long l);
699713 // returns a pseudo-random number in the range 0..2^l-1
714 // EXCEPTIONS: strong ES
700715
701716 unsigned long RandomWord();
702717 // returns a word filled with pseudo-random bits.
703718 // Equivalent to RandomBits_ulong(NTL_BITS_PER_LONG).
719 // EXCEPTIONS: strong ES
720
721
722
723 class RandomStream {
724 // The low-level pseudo-random generator (PRG).
725 // After initializing it with a key, one can effectively read an unbounded
726 // stream of pseudorandom bytes
727
728 public:
729
730 explicit RandomStream(const unsigned char *key);
731 // key should point to an array of NTL_PRG_KEYLEN bytes
732 // EXCEPTIONS: nothrow
733
734 void get(unsigned char *res, long n);
735 // read the next n bytes from the stream and store to location pointed to by
736 // res
737 // EXCEPTIONS: throws a LogicError exception if n is negative
738
739 RandomStream(const RandomStream&); // default
740 RandomStream& operator=(const RandomStream&); // default
741 // EXCEPTIONS: nothrow
742 };
743
744
745 RandomStream& GetCurrentRandomStream();
746 // get reference to the current PRG state. If SetSeed has not been called, it
747 // is called with a default value (which should be unique to each
748 // process/thread). NOTE: this is a reference to a thread-local object, so
749 // different threads will use different PRG's, and by default, each will be
750 // initialized with a unique seed.
751 // NOTE: using this reference, you can copy the current PRG state or assign a
752 // different value to it; however, see the helper class RandomStreamPush below,
753 // which may be more convenient.
754 // EXCEPTIONS: strong ES
755
756
757
758 class RandomStreamPush {
759 // RAII for saving/restoring current PRG state
760 public:
761 RandomStreamPush(); // save a copy of the current PRG state
762 // EXCEPTIONS: strong ES
763
764 ~RandomStreamPush(); // restore the saved copy of the PRG state
765
766 private:
767 RandomStreamPush(const RandomStreamPush&); // disable
768 void operator=(const RandomStreamPush&); // disable
769 };
770
771
772 void DeriveKey(unsigned char *key, long klen,
773 const unsigned char *data, long dlen);
774 // utility routine to derive from the byte string (data, dlen) a byte string
775 // (key, klen). Heuristically, if (data, dlen) has high entropy, then (key,
776 // klen) should be pseudorandom. This routine is also used internally to
777 // derive PRG keys.
778 // EXCEPTIONS: throws LogicError exception if klen < 0 or dlen < 0
779
704780
705781
706782 /**************************************************************************\
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZVec.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZVec.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZX.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZX.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZXFactoring.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZXFactoring.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_p.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_p.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
274274 <br>
275275 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;ZZ_p::init(p2); // install p2</i></font><br>
276276 <br>
277 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// reinstall original modulus as close of scope</i></font><br>
277 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// reinstall original modulus at close of scope</i></font><br>
278278 <font color="#0000ed"><i>&nbsp;&nbsp; }</i></font><br>
279279 <br>
280280 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</i></font><br>
266266
267267 ZZ_p::init(p2); // install p2
268268
269 // reinstall original modulus as close of scope
269 // reinstall original modulus at close of scope
270270 }
271271
272272
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_pE.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_pE.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_pEX.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_pEX.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_pEXFactoring.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_pEXFactoring.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
6464 <font color="#0000ed"><i>// this routine uses external files to store some intermediate</i></font><br>
6565 <font color="#0000ed"><i>// results, which are removed if the routine terminates normally.</i></font><br>
6666 <font color="#0000ed"><i>// These files are stored in the current directory under names of the</i></font><br>
67 <font color="#0000ed"><i>// form ddf-*-baby-* and ddf-*-giant-*.</i></font><br>
67 <font color="#0000ed"><i>// form tmp-*.</i></font><br>
6868 <font color="#0000ed"><i>// The definition of &quot;large&quot; is controlled by the variable</i></font><br>
6969 <br>
7070 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#008b00"><b>extern</b></font>&nbsp;<font color="#008b00"><b>double</b></font>&nbsp;ZZ_pEXFileThresh<br>
5656 // this routine uses external files to store some intermediate
5757 // results, which are removed if the routine terminates normally.
5858 // These files are stored in the current directory under names of the
59 // form ddf-*-baby-* and ddf-*-giant-*.
59 // form tmp-*.
6060 // The definition of "large" is controlled by the variable
6161
6262 extern double ZZ_pEXFileThresh
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_pX.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_pX.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
464464 <font color="#008b00"><b>void</b></font>&nbsp;MulByXMod(ZZ_pX&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ_pX&amp; a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ_pX&amp; f);<br>
465465 ZZ_pX MulByXMod(<font color="#008b00"><b>const</b></font>&nbsp;ZZ_pX&amp; a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ_pX&amp; f);<br>
466466 <font color="#0000ed"><i>// x = (a * X) mod f</i></font><br>
467 <font color="#0000ed"><i>// NOTE: thread boosting enabled only if x does not alias a</i></font><br>
467468 <br>
468469 <font color="#008b00"><b>void</b></font>&nbsp;InvMod(ZZ_pX&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ_pX&amp; a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ_pX&amp; f);<br>
469470 ZZ_pX InvMod(<font color="#008b00"><b>const</b></font>&nbsp;ZZ_pX&amp; a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ_pX&amp; f);<br>
456456 void MulByXMod(ZZ_pX& x, const ZZ_pX& a, const ZZ_pX& f);
457457 ZZ_pX MulByXMod(const ZZ_pX& a, const ZZ_pX& f);
458458 // x = (a * X) mod f
459 // NOTE: thread boosting enabled only if x does not alias a
459460
460461 void InvMod(ZZ_pX& x, const ZZ_pX& a, const ZZ_pX& f);
461462 ZZ_pX InvMod(const ZZ_pX& a, const ZZ_pX& f);
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_pXFactoring.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_pXFactoring.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
8181 <font color="#0000ed"><i>// this routine uses external files to store some intermediate</i></font><br>
8282 <font color="#0000ed"><i>// results, which are removed if the routine terminates normally.</i></font><br>
8383 <font color="#0000ed"><i>// These files are stored in the current directory under names of the</i></font><br>
84 <font color="#0000ed"><i>// form ddf-*-baby-* and ddf-*-giant-*.&nbsp;&nbsp;</i></font><br>
84 <font color="#0000ed"><i>// form tmp-*.</i></font><br>
8585 <font color="#0000ed"><i>// The definition of &quot;large&quot; is controlled by the variable</i></font><br>
8686 <br>
8787 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#008b00"><b>extern</b></font>&nbsp;<font color="#008b00"><b>double</b></font>&nbsp;ZZ_pXFileThresh<br>
7373 // this routine uses external files to store some intermediate
7474 // results, which are removed if the routine terminates normally.
7575 // These files are stored in the current directory under names of the
76 // form ddf-*-baby-* and ddf-*-giant-*.
76 // form tmp-*.
7777 // The definition of "large" is controlled by the variable
7878
7979 extern double ZZ_pXFileThresh
1717 ########### Here are the most important variables, and their default values.
1818
1919 CXX=g++ # The C++ compiler
20
2021 CXXFLAGS=-g -O2 # C++ compilation flags
2122
22 DEF_PREFIX=/usr/local # Default software directory
23 NATIVE=on # compiles code targeted to current hardware
24
25 DEF_PREFIX=/usr/local# Default software directory
26
2327 PREFIX=$(DEF_PREFIX) # Directory in which to install NTL library components
2428 SHARED=off # Generate a shared library (as well as static)
2529
2630 NTL_THREADS=off # compile in thread-safe mode
27 NTL_EXCEPTIONS=off # compile in thread-safe mode
28
29 NTL_GMP_LIP=off # Switch to enable the use of GMP as primary
31 NTL_THREAD_BOOST=off # compile with thread boosting enabled
32 NTL_EXCEPTIONS=off # compile in exception-safe mode
33
34 NTL_GMP_LIP=on # Switch to enable the use of GMP as primary
3035 # long integer package
3136
3237 GMP_PREFIX=$(DEF_PREFIX) # Directory in which GMP components are installed
3338
34 NTL_PCLMUL=off # switch to enable the PCLMUL instruction
35 # on x86 machines for faster arithmetic over
36 # GF(2)[X] (without relying on the gf2x package)
3739
3840 NTL_GF2X_LIB=off # Switch to enable the use of the gf2x package
3941 # for faster arithmetic over GF(2)[X]
5658
5759 CXXFLAGS=-g -O2
5860
59 # Flags for the C++ compiler
60 #
61 # Note that if CXXFLAGS has not been explicitly set,
62 # then the option -std=c++11 is added if either NTL_THREADS
63 # or NTL_EXCEPTIONS is set, and the option -pthread is added
64 # if NTL_THREADS is set.
61 # Flags for the C++ compiler.
62
63
64 NATIVE=on
65
66 # Flag to target code to current hardware.
67
6568
6669
6770 ########## Installation path:
113116 # mutexes, and thread_local storage. Your compiler may not
114117 # yet support these features.
115118
119 # Note that this option is currently only supported with
120 # NTL_GMP_LIP=on.
121
122 ########## thread boosting
123
124 NTL_THREAD_BOOST=off
125
126 # Set to 'on' if you want to compile NTL so that it does
127 # certain internal computations using multiple threads.
128 # Setting this flag automatically sets the NTL_THREADS flag.
129 # This feature is a work in progress. See documentation in
130 # BasicThreadPool.txt for more details.
116131
117132 ########## exceptions
118133
125140
126141 ########## GMP variables:
127142
128 NTL_GMP_LIP=off
129
130 # Set to 'on' if you want to use GMP, the GNU Multi-Precision package,
143 NTL_GMP_LIP=on
144
145 # Set to 'off' if you don't want to use GMP, the GNU Multi-Precision package,
131146 # as the primary long integer package.
132 # This will typically yield significantly faster long integer arithmetic
133 # compared to the traditional long integer package.
134
135 # If you set this flag, please note the following.
136 # If you have installed GMP in a standard "system" location, this is
137 # all you have to do. Otherwise, if GMP is built, but not installed
138 # in a standard place, you have to set the variable GMP_PREFIX.
147 # This will lead to significantly slower code, and is not
148 # recommended.
139149
140150
141151 GMP_PREFIX=$(DEF_PREFIX)
149159 # For finer-grained control, set the variables GMP_INCDIR and GMP_LIBDIR
150160 # instead (see below).
151161
152 ######### PCLMUL accelerator:
153
154 NTL_PCLMUL=off
155
156 # set to 'on' if you want to enable the use of the PCLMUL
157 # instruction on x86 machines. The configuration script
158 # will adjust the makefile as well, and run a test program
159 # to make sure it really works.
160 #
161 # This is an aletrnative to using the gf2x library (see below).
162 # That library is currently not thread or exception safe.
163
164162
165163 ########## GF2X variables:
166164
206204
207205
208206
209 # If GMP is installed in a standard system directory, and you want to use it:
210
211 ./configure NTL_GMP_LIP=on
212
213
214207
215208 # If GMP was installed in a non-standard directory, say, $HOME/sw:
216209
217 ./configure NTL_GMP_LIP=on GMP_PREFIX=$HOME/sw
218
210 ./configure GMP_PREFIX=$HOME/sw
219211
220212
221213 # If you want to use the options -g and -O for compiling C++,
226218 # Note the use of quotes to keep the argument in one piece.
227219
228220
229 # If you want to use both GMP and the gf2x library:
230
231 ./configure NTL_GMP_LIP=on NTL_GF2X_LIB=on
232
233
234 # If you want to use GMP as well as traditional (non-ISO) mode:
235
236 ./configure NTL_GMP_LIP=on NTL_STD_CXX=off
237
221 # If you want to use the gf2x library:
222
223 ./configure NTL_GF2X_LIB=on
224
225
226
227 ###########
228 ########### A little magic
229 ###########
230
231 CXXAUTOFLAGS=
232
233 # This is a variable that is automagically set by the configuration script.
234 # These are C++ compiler flags that are selected depending on
235 # the choice of other configuration options, and are geared towards gcc.
236 # The configuration script always prints out the value it chooses.
237 # If you explicitly set a value when invoking the configuration script,
238 # then it will not change that value.
238239
239240
240241
255256 INCLUDEDIR=$(PREFIX)/include
256257 DOCDIR=$(PREFIX)/share/doc
257258
259 NTL_DISABLE_TLS_HACK=off
260 NTL_ENABLE_TLS_HACK=off
258261
259262 NTL_LEGACY_NO_NAMESPACE=off
260263 NTL_LEGACY_INPUT_ERROR=off
271274 NTL_NO_INIT_TRANS=off
272275 NTL_DISABLE_LONGDOUBLE=off
273276 NTL_DISABLE_LONGLONG=off
277 NTL_DISABLE_LL_ASM=off
278 NTL_MAXIMIZE_SP_NBITS=off
274279
275280 WIZARD=on
276281 NTL_LONG_LONG=off
285290 NTL_GF2X_NOINLINE=off
286291 NTL_GF2X_ALTCODE=off
287292 NTL_GF2X_ALTCODE1=off
293 NTL_PCLMUL=off
288294
289295 GMP_INCDIR=$(GMP_PREFIX)/include
290296 GMP_LIBDIR=$(GMP_PREFIX)/lib
338344 LIBTOOL=libtool
339345
340346 # the libtool command -- only needed if SHARED=on
347
341348
342349
343350
367374 # Execution of 'make install' copies header files into $(INCLUDEDIR)/NTL,
368375 # copies the library itself to $(LIBDIR)/libntl.a, and copies the
369376 # documentation files into $(DOCDIR)/NTL.
377
378 ########## Disable/enable TLS hack
379
380 NTL_DISABLE_TLS_HACK=off
381 NTL_ENABLE_TLS_HACK=off
382
383 # when building NTL with NTL_THREADS=on, if the compiler is gcc-compatible, a
384 # "TLS hack" may be used to work around the fact that many compilers do not
385 # (correctly) implement C++11's thread_local feature. The workaround is to use
386 # gcc's more limited __thread feature, and to emulate thread_local semantics
387 # using pthread routines.
388 #
389 # "gcc-compatible" means that the "__GNUC__" macro is defined, which means the
390 # TLS hack may be used for gcc, clang, and icc compilers. The current version
391 # of NTL will enable this hack by default, but you can disable it by specifying
392 # NTL_DISABLE_TLS_HACK=on. At some point in the future, this default behavior
393 # may change, in which case you will still be able to force the TLS hack by
394 # specifying NTL_ENABLE_TLS_HACK=on.
395
396
397
370398
371399
372400
480508
481509 NTL_DISABLE_LONGDOUBLE=off
482510
483 # Explicitly disables us of long double arithmetic in the
484 # single-precision modular arithmetic routines
511 # Explicitly disables use of long double arithmetic
485512
486513 NTL_DISABLE_LONGLONG=off
487514
488 # Explicitly disables us of long long arithmetic in the
489 # single-precision modular arithmetic routines
515 # Explicitly disables use of long long arithmetic
516
517 NTL_DISABLE_LL_ASM=off
518
519 # Explicitly disables use of inline asm as replacement for
520 # long long arithmetic
521
522 NTL_MAXIMIZE_SP_NBITS=on
523
524 # Allows for 62-bit single-precision moduli on 64-bit platforms.
525 # By default, such moduli are restricted to 60 bits, which
526 # usually gives *slightly* better performance across a range of
527 # parameters.
490528
491529
492530
600638
601639 # Yet another alternative implementation for GF2X multiplication.
602640
641 NTL_PCLMUL=off
642
643 # switch to enable the PCLMUL instruction on x86 machines for faster arithmetic
644 # over GF(2)[X] (without relying on the gf2x package)
645
603646
604647
605648 ########## More GMP Options:
11 COPYRIGHT NOTICE
22
33 NTL -- A Library for Doing Number Theory
4 Copyright (C) 1996-2015 Victor Shoup
4 Copyright (C) 1996-2016 Victor Shoup
55
66 The most recent version of NTL is available at http://www.shoup.net
77
2828 distributions. In general, the individual files do not contain
2929 copyright notices.
3030
31 Note that the file ZZ.c contains an implementation of SHA256
32 which is derived from work by Brad Conte, which is in the public domain.
33 See file ZZ.c for a more detailed notice.
34
35 Note that the file mat_lzz_p.c contains an implementation of Strassen's
36 matrix multiplication algorithm which is derived from the implementation
37 in FLINT v2.5.2. The latter is copyrighted by Martin Albrecht, William Hart,
38 and Fredrik Johansson, and also licensed under the GPL.
39 See file mat_lzz_p.c for a more detailed notice.
40
3141 Note that the quad_float package is derived from the doubledouble package,
32 originally developed by Keith Briggs, and also licensed unger the GNU GPL.
42 originally developed by Keith Briggs, and also licensed under the GNU GPL.
3343 The files quad_float.c and quad_float.h contain more detailed copyright
3444 notices.
3545
3747 from---and represents an extensive modification of---
3848 a package originally developed and copyrighted by Arjen Lenstra,
3949 who has agreed to renounce any copyright claims on the particular
40 version of the long integer package appearing in NTL, so that the
50 version of the long integer package appearing in NTL, so that
4151 this package now is covered by the GNU GPL as well.
4252
4353 Note that the alternative long integer package used by NTL is GMP,
4454 which is written by Torbjorn Granlund <tege@swox.com>.
4555 GMP is licensed under the terms of the GNU Lesser General Public License.
4656
47 Note that NTL makes use of the RSA Data Security, Inc. MD5 Message
48 Digest Algorithm.
4957
5058 Note that prior to version 4.0, NTL was distributed under the following terms:
5159 NTL is freely available for research and educational purposes.
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_p.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_p.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_pE.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_pE.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_pEX.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_pEX.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_pEXFactoring.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_pEXFactoring.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
6464 <font color="#0000ed"><i>// this routine uses external files to store some intermediate</i></font><br>
6565 <font color="#0000ed"><i>// results, which are removed if the routine terminates normally.</i></font><br>
6666 <font color="#0000ed"><i>// These files are stored in the current directory under names of the</i></font><br>
67 <font color="#0000ed"><i>// form ddf-*-baby-* and ddf-*-giant-*.</i></font><br>
67 <font color="#0000ed"><i>// form tmp-*.</i></font><br>
6868 <font color="#0000ed"><i>// The definition of &quot;large&quot; is controlled by the variable</i></font><br>
6969 <br>
7070 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#008b00"><b>extern</b></font>&nbsp;<font color="#008b00"><b>double</b></font>&nbsp;zz_pEXFileThresh<br>
5656 // this routine uses external files to store some intermediate
5757 // results, which are removed if the routine terminates normally.
5858 // These files are stored in the current directory under names of the
59 // form ddf-*-baby-* and ddf-*-giant-*.
59 // form tmp-*.
6060 // The definition of "large" is controlled by the variable
6161
6262 extern double zz_pEXFileThresh
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_pX.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_pX.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
725725 <font color="#0000ed"><i>// and the power projection and minimal polynomial routines below, </i></font><br>
726726 <font color="#0000ed"><i>// and indirectly affects many routines in zz_pXFactoring.</i></font><br>
727727 <br>
728 <br>
729 <a name="compmod"></a>
730 <br>
731 <font color="#0000ed"><i>/*</i></font><font color="#0000ed"><i>*************************************************************************\</i></font><br>
732 <br>
733 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Faster Composition with Pre-Conditioning</i></font><br>
734 <br>
735 <font color="#0000ed"><i>A new, experimental version of composition with preconditioning.</i></font><br>
736 <font color="#0000ed"><i>This interface was introduced in NTL v9.6.3, and it should be </i></font><br>
737 <font color="#0000ed"><i>considered a preliminary interface and subject to change (although</i></font><br>
738 <font color="#0000ed"><i>it is likely to not change very much).</i></font><br>
739 <br>
740 <font color="#0000ed"><i>Usage:</i></font><br>
741 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;zz_pX x, g, h;</i></font><br>
742 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;zz_pXModulus F;</i></font><br>
743 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;zz_pXArgument H;</i></font><br>
744 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;build(H, h, F);</i></font><br>
745 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;zz_pXAltArgument H1;</i></font><br>
746 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;build(H1, H, F);&nbsp;&nbsp;// this keeps a pointer to H, so H must remain alive</i></font><br>
747 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;CompMod(x, g, H1, F);&nbsp;&nbsp;// x = g(h) mod f</i></font><br>
748 <br>
749 <font color="#0000ed"><i>The idea is that H1 stores the data in H in an alternative format</i></font><br>
750 <font color="#0000ed"><i>that allows for a more cache-friendly and more efficient execution</i></font><br>
751 <font color="#0000ed"><i>of CompMod.&nbsp;&nbsp;Depending on a variety of factors, this can be up to </i></font><br>
752 <font color="#0000ed"><i>about 3x faster than the regular CompMod.</i></font><br>
753 <br>
754 <br>
755 <font color="#0000ed"><i>\*************************************************************************</i></font><font color="#0000ed"><i>*/</i></font><br>
756 <br>
757 <font color="#008b00"><b>class</b></font>&nbsp;&nbsp;zz_pXAltArgument { <br>
758 &nbsp;<font color="#0000ed"><i>// ...</i></font><br>
759 };<br>
760 <br>
761 <font color="#008b00"><b>void</b></font>&nbsp;build(zz_pXAltArgument&amp; altH, <font color="#008b00"><b>const</b></font>&nbsp;zz_pXArgument&amp; H, <font color="#008b00"><b>const</b></font>&nbsp;zz_pXModulus&amp; F);<br>
762 <font color="#008b00"><b>void</b></font>&nbsp;CompMod(zz_pX&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;zz_pX&amp; g, <font color="#008b00"><b>const</b></font>&nbsp;zz_pXAltArgument&amp; A, <br>
763 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#008b00"><b>const</b></font>&nbsp;zz_pXModulus&amp; F);<br>
764 <br>
765 <br>
766 <br>
728767 <font color="#0000ed"><i>/*</i></font><font color="#0000ed"><i>*************************************************************************\</i></font><br>
729768 <br>
730769 <font color="#0000ed"><i>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; power projection routines</i></font><br>
717717 // and the power projection and minimal polynomial routines below,
718718 // and indirectly affects many routines in zz_pXFactoring.
719719
720
721 // @anchor{compmod}
722
723 /**************************************************************************\
724
725 Faster Composition with Pre-Conditioning
726
727 A new, experimental version of composition with preconditioning.
728 This interface was introduced in NTL v9.6.3, and it should be
729 considered a preliminary interface and subject to change (although
730 it is likely to not change very much).
731
732 Usage:
733 zz_pX x, g, h;
734 zz_pXModulus F;
735 zz_pXArgument H;
736 build(H, h, F);
737 zz_pXAltArgument H1;
738 build(H1, H, F); // this keeps a pointer to H, so H must remain alive
739 CompMod(x, g, H1, F); // x = g(h) mod f
740
741 The idea is that H1 stores the data in H in an alternative format
742 that allows for a more cache-friendly and more efficient execution
743 of CompMod. Depending on a variety of factors, this can be up to
744 about 3x faster than the regular CompMod.
745
746
747 \**************************************************************************/
748
749 class zz_pXAltArgument {
750 // ...
751 };
752
753 void build(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F);
754 void CompMod(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
755 const zz_pXModulus& F);
756
757
758
720759 /**************************************************************************\
721760
722761 power projection routines
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_pXFactoring.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_pXFactoring.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_GF2.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_GF2.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
6767 <font color="#0000ed"><i>// X = transpose of A</i></font><br>
6868 <br>
6969 <font color="#008b00"><b>void</b></font>&nbsp;solve(GF2&amp; d, vec_GF2&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;mat_GF2&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_GF2&amp; b);<br>
70 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = det(A).&nbsp;&nbsp;</i></font><br>
70 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
7171 <font color="#0000ed"><i>// If d != 0, solves x*A = b. </i></font><br>
72 <br>
73 <font color="#008b00"><b>void</b></font>&nbsp;solve(GF2&amp; d, <font color="#008b00"><b>const</b></font>&nbsp;mat_GF2&amp; A, vec_GF2&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;vec_GF2&amp; b);<br>
74 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
75 <font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
7276 <br>
7377 <font color="#008b00"><b>void</b></font>&nbsp;inv(GF2&amp; d, mat_GF2&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_GF2&amp; A);<br>
7478 <font color="#0000ed"><i>// A is an n x n matrix.&nbsp;&nbsp;Computes d = det(A).&nbsp;&nbsp;If d != 0,</i></font><br>
5959 // X = transpose of A
6060
6161 void solve(GF2& d, vec_GF2& x, const mat_GF2& A, const vec_GF2& b);
62 // A is an n x n matrix, b is a length n vector. Computes d = det(A).
62 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
6363 // If d != 0, solves x*A = b.
64
65 void solve(GF2& d, const mat_GF2& A, vec_GF2& x, const vec_GF2& b);
66 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
67 // If d != 0, solves A*x = b (so x and b are treated as column vectors).
6468
6569 void inv(GF2& d, mat_GF2& X, const mat_GF2& A);
6670 // A is an n x n matrix. Computes d = det(A). If d != 0,
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_GF2E.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_GF2E.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
6363 mat_GF2E transpose(<font color="#008b00"><b>const</b></font>&nbsp;mat_GF2E&amp; A);<br>
6464 <font color="#0000ed"><i>// X = transpose of A</i></font><br>
6565 <br>
66 <font color="#008b00"><b>void</b></font>&nbsp;solve(GF2E&amp; d, vec_GF2E&amp; X,<br>
67 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#008b00"><b>const</b></font>&nbsp;mat_GF2E&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_GF2E&amp; b);<br>
68 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d =</i></font><br>
69 <font color="#0000ed"><i>// determinant(A).&nbsp;&nbsp;If d != 0, solves x*A = b.</i></font><br>
66 <font color="#008b00"><b>void</b></font>&nbsp;solve(GF2E&amp; d, vec_GF2E&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;mat_GF2E&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_GF2E&amp; b);<br>
67 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
68 <font color="#0000ed"><i>// If d != 0, solves x*A = b.</i></font><br>
69 <br>
70 <font color="#008b00"><b>void</b></font>&nbsp;solve(GF2E&amp; d, <font color="#008b00"><b>const</b></font>&nbsp;mat_GF2E&amp; A, vec_GF2E&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;vec_GF2E&amp; b);<br>
71 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
72 <font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
7073 <br>
7174 <font color="#008b00"><b>void</b></font>&nbsp;inv(GF2E&amp; d, mat_GF2E&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_GF2E&amp; A);<br>
7275 <font color="#0000ed"><i>// A is an n x n matrix.&nbsp;&nbsp;Computes d = determinant(A).&nbsp;&nbsp;If d != 0,</i></font><br>
5555 mat_GF2E transpose(const mat_GF2E& A);
5656 // X = transpose of A
5757
58 void solve(GF2E& d, vec_GF2E& X,
59 const mat_GF2E& A, const vec_GF2E& b);
60 // A is an n x n matrix, b is a length n vector. Computes d =
61 // determinant(A). If d != 0, solves x*A = b.
58 void solve(GF2E& d, vec_GF2E& x, const mat_GF2E& A, const vec_GF2E& b);
59 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
60 // If d != 0, solves x*A = b.
61
62 void solve(GF2E& d, const mat_GF2E& A, vec_GF2E& x, const vec_GF2E& b);
63 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
64 // If d != 0, solves A*x = b (so x and b are treated as column vectors).
6265
6366 void inv(GF2E& d, mat_GF2E& X, const mat_GF2E& A);
6467 // A is an n x n matrix. Computes d = determinant(A). If d != 0,
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_RR.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_RR.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_ZZ.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_ZZ.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_ZZ_p.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_ZZ_p.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
5959 mat_ZZ_p transpose(<font color="#008b00"><b>const</b></font>&nbsp;mat_ZZ_p&amp; A);<br>
6060 <font color="#0000ed"><i>// X = transpose of A</i></font><br>
6161 <br>
62 <font color="#008b00"><b>void</b></font>&nbsp;solve(ZZ_p&amp; d, vec_ZZ_p&amp; X,<br>
63 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#008b00"><b>const</b></font>&nbsp;mat_ZZ_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_ZZ_p&amp; b);<br>
64 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d =</i></font><br>
65 <font color="#0000ed"><i>// determinant(A).&nbsp;&nbsp;If d != 0, solves x*A = b.</i></font><br>
62 <font color="#008b00"><b>void</b></font>&nbsp;solve(ZZ_p&amp; d, vec_ZZ_p&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;mat_ZZ_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_ZZ_p&amp; b);<br>
63 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
64 <font color="#0000ed"><i>// If d != 0, solves x*A = b.</i></font><br>
65 <br>
66 <font color="#008b00"><b>void</b></font>&nbsp;solve(ZZ_p&amp; d, <font color="#008b00"><b>const</b></font>&nbsp;mat_ZZ_p&amp; A, vec_ZZ_p&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;vec_ZZ_p&amp; b);<br>
67 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
68 <font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
6669 <br>
6770 <font color="#008b00"><b>void</b></font>&nbsp;inv(ZZ_p&amp; d, mat_ZZ_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_ZZ_p&amp; A);<br>
6871 <font color="#0000ed"><i>// A is an n x n matrix.&nbsp;&nbsp;Computes d = determinant(A).&nbsp;&nbsp;If d != 0,</i></font><br>
5151 mat_ZZ_p transpose(const mat_ZZ_p& A);
5252 // X = transpose of A
5353
54 void solve(ZZ_p& d, vec_ZZ_p& X,
55 const mat_ZZ_p& A, const vec_ZZ_p& b);
56 // A is an n x n matrix, b is a length n vector. Computes d =
57 // determinant(A). If d != 0, solves x*A = b.
54 void solve(ZZ_p& d, vec_ZZ_p& x, const mat_ZZ_p& A, const vec_ZZ_p& b);
55 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
56 // If d != 0, solves x*A = b.
57
58 void solve(ZZ_p& d, const mat_ZZ_p& A, vec_ZZ_p& x, const vec_ZZ_p& b);
59 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
60 // If d != 0, solves A*x = b (so x and b are treated as column vectors).
5861
5962 void inv(ZZ_p& d, mat_ZZ_p& X, const mat_ZZ_p& A);
6063 // A is an n x n matrix. Computes d = determinant(A). If d != 0,
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_ZZ_pE.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_ZZ_pE.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
6161 mat_ZZ_pE transpose(<font color="#008b00"><b>const</b></font>&nbsp;mat_ZZ_pE&amp; A);<br>
6262 <font color="#0000ed"><i>// X = transpose of A</i></font><br>
6363 <br>
64 <font color="#008b00"><b>void</b></font>&nbsp;solve(ZZ_pE&amp; d, vec_ZZ_pE&amp; X,<br>
65 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#008b00"><b>const</b></font>&nbsp;mat_ZZ_pE&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_ZZ_pE&amp; b);<br>
66 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d =</i></font><br>
67 <font color="#0000ed"><i>// determinant(A).&nbsp;&nbsp;If d != 0, solves x*A = b.</i></font><br>
64 <font color="#008b00"><b>void</b></font>&nbsp;solve(ZZ_pE&amp; d, vec_ZZ_pE&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;mat_ZZ_pE&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_ZZ_pE&amp; b);<br>
65 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
66 <font color="#0000ed"><i>// If d != 0, solves x*A = b.</i></font><br>
67 <br>
68 <font color="#008b00"><b>void</b></font>&nbsp;solve(ZZ_pE&amp; d, <font color="#008b00"><b>const</b></font>&nbsp;mat_ZZ_pE&amp; A, vec_ZZ_pE&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;vec_ZZ_pE&amp; b);<br>
69 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
70 <font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
6871 <br>
6972 <font color="#008b00"><b>void</b></font>&nbsp;inv(ZZ_pE&amp; d, mat_ZZ_pE&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_ZZ_pE&amp; A);<br>
7073 <font color="#0000ed"><i>// A is an n x n matrix.&nbsp;&nbsp;Computes d = determinant(A).&nbsp;&nbsp;If d != 0,</i></font><br>
5353 mat_ZZ_pE transpose(const mat_ZZ_pE& A);
5454 // X = transpose of A
5555
56 void solve(ZZ_pE& d, vec_ZZ_pE& X,
57 const mat_ZZ_pE& A, const vec_ZZ_pE& b);
58 // A is an n x n matrix, b is a length n vector. Computes d =
59 // determinant(A). If d != 0, solves x*A = b.
56 void solve(ZZ_pE& d, vec_ZZ_pE& x, const mat_ZZ_pE& A, const vec_ZZ_pE& b);
57 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
58 // If d != 0, solves x*A = b.
59
60 void solve(ZZ_pE& d, const mat_ZZ_pE& A, vec_ZZ_pE& x, const vec_ZZ_pE& b);
61 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
62 // If d != 0, solves A*x = b (so x and b are treated as column vectors).
6063
6164 void inv(ZZ_pE& d, mat_ZZ_pE& X, const mat_ZZ_pE& A);
6265 // A is an n x n matrix. Computes d = determinant(A). If d != 0,
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_lzz_p.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_lzz_p.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
1313 <font color="#0000ed"><i>SUMMARY:</i></font><br>
1414 <br>
1515 <font color="#0000ed"><i>Defines the class mat_zz_p.</i></font><br>
16 <font color="#0000ed"><i>Note that the modulus p need not be a prime, except as indicated below.</i></font><br>
17 <br>
18 <font color="#0000ed"><i>IMPLEMENTATION NOTES: </i></font><br>
19 <br>
20 <font color="#0000ed"><i>Starting with NTL version 9.7.0 (and 9.7.1), many of the routines here have</i></font><br>
21 <font color="#0000ed"><i>been optimized to take better advantage of specific hardware features available</i></font><br>
22 <font color="#0000ed"><i>on 64-bit Intel CPU's.&nbsp;&nbsp;Currently, the mul, inv, determinant, solve, gauss,</i></font><br>
23 <font color="#0000ed"><i>kernel, and image routines are fastest for p up to 23-bits long (assuming the</i></font><br>
24 <font color="#0000ed"><i>CPU supports AVX instructions).&nbsp;&nbsp;After that, performance degrades in three</i></font><br>
25 <font color="#0000ed"><i>stages: stage 1: up to 28-bits; stage 2: up to 31-bits; stage 3: 32-bits and</i></font><br>
26 <font color="#0000ed"><i>up. </i></font><br>
27 <br>
28 <font color="#0000ed"><i>For primes up to 23-bits, AVX floating point instructions are used.&nbsp;&nbsp;After</i></font><br>
29 <font color="#0000ed"><i>that, ordinary integer arithmetic is used.&nbsp;&nbsp;In a future version, I may exploit</i></font><br>
30 <font color="#0000ed"><i>AVX2 integer instructions to get better stage 2 performance.&nbsp;&nbsp;And in the more</i></font><br>
31 <font color="#0000ed"><i>distant future, AVX512 instructions will be used, when they become available.</i></font><br>
32 <br>
33 <font color="#0000ed"><i>On older Intel machines, or non-Intel machines that have &quot;long long&quot; support,</i></font><br>
34 <font color="#0000ed"><i>one still gets optimizations corresponding to the three stages above.&nbsp;&nbsp;On</i></font><br>
35 <font color="#0000ed"><i>32-bit machines, one still gets three stages, just with smaller crossover</i></font><br>
36 <font color="#0000ed"><i>points.</i></font><br>
1637 <br>
1738 <font color="#0000ed"><i>\*************************************************************************</i></font><font color="#0000ed"><i>*/</i></font><br>
1839 <br>
4768 <font color="#0000ed"><i>// X = a * B</i></font><br>
4869 <br>
4970 <br>
71 <font color="#008b00"><b>void</b></font>&nbsp;transpose(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
72 mat_zz_p transpose(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
73 <font color="#0000ed"><i>// X = transpose of A</i></font><br>
74 <br>
75 <br>
5076 <font color="#008b00"><b>void</b></font>&nbsp;determinant(zz_p&amp; d, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
5177 zz_p determinant(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; a); <br>
5278 <font color="#0000ed"><i>// d = determinant(A)</i></font><br>
5379 <br>
54 <br>
55 <font color="#008b00"><b>void</b></font>&nbsp;transpose(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
56 mat_zz_p transpose(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
57 <font color="#0000ed"><i>// X = transpose of A</i></font><br>
58 <br>
59 <font color="#008b00"><b>void</b></font>&nbsp;solve(zz_p&amp; d, vec_zz_p&amp; X,<br>
60 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_zz_p&amp; b);<br>
61 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d =</i></font><br>
62 <font color="#0000ed"><i>// determinant(A).&nbsp;&nbsp;If d != 0, solves x*A = b.</i></font><br>
80 <font color="#008b00"><b>void</b></font>&nbsp;solve(zz_p&amp; d, vec_zz_p&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_zz_p&amp; b);<br>
81 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
82 <font color="#0000ed"><i>// If d != 0, solves x*A = b (so x and b are treated as row vectors).</i></font><br>
83 <br>
84 <font color="#008b00"><b>void</b></font>&nbsp;solve(zz_p&amp; d, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, vec_zz_p&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;vec_zz_p&amp; b);<br>
85 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
86 <font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
6387 <br>
6488 <font color="#008b00"><b>void</b></font>&nbsp;inv(zz_p&amp; d, mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
6589 <font color="#0000ed"><i>// A is an n x n matrix.&nbsp;&nbsp;Computes d = determinant(A).&nbsp;&nbsp;If d != 0,</i></font><br>
6690 <font color="#0000ed"><i>// computes X = A^{-1}.</i></font><br>
6791 <br>
92 <br>
93 <font color="#008b00"><b>void</b></font>&nbsp;inv(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
94 mat_zz_p inv(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
95 <font color="#0000ed"><i>// X = A^{-1}; error is raised if A is&nbsp;&nbsp;singular</i></font><br>
96 <br>
97 <font color="#008b00"><b>void</b></font>&nbsp;power(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; e);<br>
98 mat_zz_p power(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; e);<br>
99 <font color="#008b00"><b>void</b></font>&nbsp;power(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>long</b></font>&nbsp;e);<br>
100 mat_zz_p power(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>long</b></font>&nbsp;e);<br>
101 <font color="#0000ed"><i>// X = A^e; e may be negative (in which case A must be nonsingular).</i></font><br>
102 <br>
103 <font color="#0000ed"><i>// NOTE: the routines determinant, solve, inv, and power (with negative</i></font><br>
104 <font color="#0000ed"><i>// exponent) all require that the modulus p is prime: during elimination, if a</i></font><br>
105 <font color="#0000ed"><i>// non-zero pivot element does not have an inverse, an error is raised.&nbsp;&nbsp;The</i></font><br>
106 <font color="#0000ed"><i>// following &quot;relaxed&quot; versions of these routines will also work with prime</i></font><br>
107 <font color="#0000ed"><i>// powers, if the optional parameter relax is true (which is the default).</i></font><br>
108 <font color="#0000ed"><i>// However, note that in these relaxed routines, if a computed determinant</i></font><br>
109 <font color="#0000ed"><i>// value is zero, this may not be the true determinant: all that you can assume</i></font><br>
110 <font color="#0000ed"><i>// is that the true determinant is not invertible mod p. If the parameter</i></font><br>
111 <font color="#0000ed"><i>// relax==false, then these routines behave identically to their &quot;unrelaxed&quot;</i></font><br>
112 <font color="#0000ed"><i>// counterparts.</i></font><br>
113 <br>
114 <font color="#008b00"><b>void</b></font>&nbsp;relaxed_determinant(zz_p&amp; d, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>);<br>
115 zz_p relaxed_determinant(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; a, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>); <br>
116 <font color="#008b00"><b>void</b></font>&nbsp;relaxed_solve(zz_p&amp; d, vec_zz_p&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_zz_p&amp; b, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>);<br>
117 <font color="#008b00"><b>void</b></font>&nbsp;relaxed_solve(zz_p&amp; d, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, vec_zz_p&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;vec_zz_p&amp; b, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>);<br>
118 <font color="#008b00"><b>void</b></font>&nbsp;relaxed_inv(zz_p&amp; d, mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>);<br>
119 <font color="#008b00"><b>void</b></font>&nbsp;relaxed_inv(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>);<br>
120 mat_zz_p relaxed_inv(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>);<br>
121 <font color="#008b00"><b>void</b></font>&nbsp;relaxed_power(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; e, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>);<br>
122 mat_zz_p relaxed_power(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; e, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>);<br>
123 <font color="#008b00"><b>void</b></font>&nbsp;relaxed_power(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>long</b></font>&nbsp;e, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>);<br>
124 mat_zz_p relaxed_power(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>long</b></font>&nbsp;e, <font color="#008b00"><b>bool</b></font>&nbsp;relax=<font color="#cc0000">true</font>);<br>
125 <br>
126 <br>
68127 <font color="#008b00"><b>void</b></font>&nbsp;sqr(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
69128 mat_zz_p sqr(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
70129 <font color="#0000ed"><i>// X = A*A&nbsp;&nbsp; </i></font><br>
71 <br>
72 <font color="#008b00"><b>void</b></font>&nbsp;inv(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
73 mat_zz_p inv(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A);<br>
74 <font color="#0000ed"><i>// X = A^{-1}; error is raised if A is&nbsp;&nbsp;singular</i></font><br>
75 <br>
76 <font color="#008b00"><b>void</b></font>&nbsp;power(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; e);<br>
77 mat_zz_p power(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;ZZ&amp; e);<br>
78 <br>
79 <font color="#008b00"><b>void</b></font>&nbsp;power(mat_zz_p&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>long</b></font>&nbsp;e);<br>
80 mat_zz_p power(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_p&amp; A, <font color="#008b00"><b>long</b></font>&nbsp;e);<br>
81 <font color="#0000ed"><i>// X = A^e; e may be negative (in which case A must be nonsingular).</i></font><br>
82 <br>
83130 <br>
84131 <font color="#008b00"><b>void</b></font>&nbsp;ident(mat_zz_p&amp; X, <font color="#008b00"><b>long</b></font>&nbsp;n);<br>
85132 mat_zz_p ident_mat_zz_p(<font color="#008b00"><b>long</b></font>&nbsp;n);<br>
112159 <font color="#0000ed"><i>// Computes a basis for the kernel of the map x -&gt; x*A, where x is a</i></font><br>
113160 <font color="#0000ed"><i>// row vector.</i></font><br>
114161 <br>
162 <font color="#0000ed"><i>// NOTE: the gauss, image, and kernel routines all require that</i></font><br>
163 <font color="#0000ed"><i>// the modulus p is prime. </i></font><br>
164 <br>
115165 <br>
116166 <br>
117167 <font color="#0000ed"><i>// miscellaneous:</i></font><br>
55 SUMMARY:
66
77 Defines the class mat_zz_p.
8 Note that the modulus p need not be a prime, except as indicated below.
9
10 IMPLEMENTATION NOTES:
11
12 Starting with NTL version 9.7.0 (and 9.7.1), many of the routines here have
13 been optimized to take better advantage of specific hardware features available
14 on 64-bit Intel CPU's. Currently, the mul, inv, determinant, solve, gauss,
15 kernel, and image routines are fastest for p up to 23-bits long (assuming the
16 CPU supports AVX instructions). After that, performance degrades in three
17 stages: stage 1: up to 28-bits; stage 2: up to 31-bits; stage 3: 32-bits and
18 up.
19
20 For primes up to 23-bits, AVX floating point instructions are used. After
21 that, ordinary integer arithmetic is used. In a future version, I may exploit
22 AVX2 integer instructions to get better stage 2 performance. And in the more
23 distant future, AVX512 instructions will be used, when they become available.
24
25 On older Intel machines, or non-Intel machines that have "long long" support,
26 one still gets optimizations corresponding to the three stages above. On
27 32-bit machines, one still gets three stages, just with smaller crossover
28 points.
829
930 \**************************************************************************/
1031
3960 // X = a * B
4061
4162
63 void transpose(mat_zz_p& X, const mat_zz_p& A);
64 mat_zz_p transpose(const mat_zz_p& A);
65 // X = transpose of A
66
67
4268 void determinant(zz_p& d, const mat_zz_p& A);
4369 zz_p determinant(const mat_zz_p& a);
4470 // d = determinant(A)
4571
46
47 void transpose(mat_zz_p& X, const mat_zz_p& A);
48 mat_zz_p transpose(const mat_zz_p& A);
49 // X = transpose of A
50
51 void solve(zz_p& d, vec_zz_p& X,
52 const mat_zz_p& A, const vec_zz_p& b);
53 // A is an n x n matrix, b is a length n vector. Computes d =
54 // determinant(A). If d != 0, solves x*A = b.
72 void solve(zz_p& d, vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b);
73 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
74 // If d != 0, solves x*A = b (so x and b are treated as row vectors).
75
76 void solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b);
77 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
78 // If d != 0, solves A*x = b (so x and b are treated as column vectors).
5579
5680 void inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A);
5781 // A is an n x n matrix. Computes d = determinant(A). If d != 0,
5882 // computes X = A^{-1}.
5983
84
85 void inv(mat_zz_p& X, const mat_zz_p& A);
86 mat_zz_p inv(const mat_zz_p& A);
87 // X = A^{-1}; error is raised if A is singular
88
89 void power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e);
90 mat_zz_p power(const mat_zz_p& A, const ZZ& e);
91 void power(mat_zz_p& X, const mat_zz_p& A, long e);
92 mat_zz_p power(const mat_zz_p& A, long e);
93 // X = A^e; e may be negative (in which case A must be nonsingular).
94
95 // NOTE: the routines determinant, solve, inv, and power (with negative
96 // exponent) all require that the modulus p is prime: during elimination, if a
97 // non-zero pivot element does not have an inverse, an error is raised. The
98 // following "relaxed" versions of these routines will also work with prime
99 // powers, if the optional parameter relax is true (which is the default).
100 // However, note that in these relaxed routines, if a computed determinant
101 // value is zero, this may not be the true determinant: all that you can assume
102 // is that the true determinant is not invertible mod p. If the parameter
103 // relax==false, then these routines behave identically to their "unrelaxed"
104 // counterparts.
105
106 void relaxed_determinant(zz_p& d, const mat_zz_p& A, bool relax=true);
107 zz_p relaxed_determinant(const mat_zz_p& a, bool relax=true);
108 void relaxed_solve(zz_p& d, vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b, bool relax=true);
109 void relaxed_solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b, bool relax=true);
110 void relaxed_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax=true);
111 void relaxed_inv(mat_zz_p& X, const mat_zz_p& A, bool relax=true);
112 mat_zz_p relaxed_inv(const mat_zz_p& A, bool relax=true);
113 void relaxed_power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e, bool relax=true);
114 mat_zz_p relaxed_power(const mat_zz_p& A, const ZZ& e, bool relax=true);
115 void relaxed_power(mat_zz_p& X, const mat_zz_p& A, long e, bool relax=true);
116 mat_zz_p relaxed_power(const mat_zz_p& A, long e, bool relax=true);
117
118
60119 void sqr(mat_zz_p& X, const mat_zz_p& A);
61120 mat_zz_p sqr(const mat_zz_p& A);
62121 // X = A*A
63
64 void inv(mat_zz_p& X, const mat_zz_p& A);
65 mat_zz_p inv(const mat_zz_p& A);
66 // X = A^{-1}; error is raised if A is singular
67
68 void power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e);
69 mat_zz_p power(const mat_zz_p& A, const ZZ& e);
70
71 void power(mat_zz_p& X, const mat_zz_p& A, long e);
72 mat_zz_p power(const mat_zz_p& A, long e);
73 // X = A^e; e may be negative (in which case A must be nonsingular).
74
75122
76123 void ident(mat_zz_p& X, long n);
77124 mat_zz_p ident_mat_zz_p(long n);
104151 // Computes a basis for the kernel of the map x -> x*A, where x is a
105152 // row vector.
106153
154 // NOTE: the gauss, image, and kernel routines all require that
155 // the modulus p is prime.
156
107157
108158
109159 // miscellaneous:
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_lzz_pE.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_lzz_pE.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
6161 mat_zz_pE transpose(<font color="#008b00"><b>const</b></font>&nbsp;mat_zz_pE&amp; A);<br>
6262 <font color="#0000ed"><i>// X = transpose of A</i></font><br>
6363 <br>
64 <font color="#008b00"><b>void</b></font>&nbsp;solve(zz_pE&amp; d, vec_zz_pE&amp; X,<br>
65 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_pE&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_zz_pE&amp; b);<br>
64 <font color="#008b00"><b>void</b></font>&nbsp;solve(zz_pE&amp; d, vec_zz_pE&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_pE&amp; A, <font color="#008b00"><b>const</b></font>&nbsp;vec_zz_pE&amp; b);<br>
6665 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d =</i></font><br>
6766 <font color="#0000ed"><i>// determinant(A).&nbsp;&nbsp;If d != 0, solves x*A = b.</i></font><br>
67 <br>
68 <font color="#008b00"><b>void</b></font>&nbsp;solve(zz_pE&amp; d, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_pE&amp; A, vec_zz_pE&amp; x, <font color="#008b00"><b>const</b></font>&nbsp;vec_zz_pE&amp; b);<br>
69 <font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.&nbsp;&nbsp;Computes d = determinant(A).</i></font><br>
70 <font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
6871 <br>
6972 <font color="#008b00"><b>void</b></font>&nbsp;inv(zz_pE&amp; d, mat_zz_pE&amp; X, <font color="#008b00"><b>const</b></font>&nbsp;mat_zz_pE&amp; A);<br>
7073 <font color="#0000ed"><i>// A is an n x n matrix.&nbsp;&nbsp;Computes d = determinant(A).&nbsp;&nbsp;If d != 0,</i></font><br>
5353 mat_zz_pE transpose(const mat_zz_pE& A);
5454 // X = transpose of A
5555
56 void solve(zz_pE& d, vec_zz_pE& X,
57 const mat_zz_pE& A, const vec_zz_pE& b);
56 void solve(zz_pE& d, vec_zz_pE& x, const mat_zz_pE& A, const vec_zz_pE& b);
5857 // A is an n x n matrix, b is a length n vector. Computes d =
5958 // determinant(A). If d != 0, solves x*A = b.
59
60 void solve(zz_pE& d, const mat_zz_pE& A, vec_zz_pE& x, const vec_zz_pE& b);
61 // A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
62 // If d != 0, solves A*x = b (so x and b are treated as column vectors).
6063
6164 void inv(zz_pE& d, mat_zz_pE& X, const mat_zz_pE& A);
6265 // A is an n x n matrix. Computes d = determinant(A). If d != 0,
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_poly_ZZ.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_poly_ZZ.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_poly_ZZ_p.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_poly_ZZ_p.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_poly_lzz_p.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_poly_lzz_p.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/matrix.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/matrix.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/pair.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/pair.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/quad_float.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/quad_float.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/tools.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/tools.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
7575 <br>
7676 <font color="#008b00"><b>long</b></font>&nbsp;min(<font color="#008b00"><b>long</b></font>&nbsp;a, <font color="#008b00"><b>int</b></font>&nbsp;b);<br>
7777 <font color="#008b00"><b>long</b></font>&nbsp;max(<font color="#008b00"><b>long</b></font>&nbsp;a, <font color="#008b00"><b>int</b></font>&nbsp;b);<br>
78 <br>
79 <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>int</b></font>&nbsp;min(<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>int</b></font>&nbsp;a, <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>int</b></font>&nbsp;b);<br>
80 <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>int</b></font>&nbsp;max(<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>int</b></font>&nbsp;a, <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>int</b></font>&nbsp;b);<br>
81 <br>
82 <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;min(<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;a, <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;b);<br>
83 <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;max(<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;a, <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;b);<br>
84 <br>
85 <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;min(<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>int</b></font>&nbsp;a, <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;b);<br>
86 <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;max(<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>int</b></font>&nbsp;a, <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;b);<br>
87 <br>
88 <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;min(<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;a, <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>int</b></font>&nbsp;b);<br>
89 <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;max(<font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>long</b></font>&nbsp;a, <font color="#008b00"><b>unsigned</b></font>&nbsp;<font color="#008b00"><b>int</b></font>&nbsp;b);<br>
90 <br>
7891 <br>
7992 <font color="#008b00"><b>void</b></font>&nbsp;swap(<font color="#008b00"><b>long</b></font>&amp; a, <font color="#008b00"><b>long</b></font>&amp; b);<br>
8093 <font color="#008b00"><b>void</b></font>&nbsp;swap(<font color="#008b00"><b>int</b></font>&amp; a, <font color="#008b00"><b>int</b></font>&amp; b);<br>
6767
6868 long min(long a, int b);
6969 long max(long a, int b);
70
71 unsigned int min(unsigned int a, unsigned int b);
72 unsigned int max(unsigned int a, unsigned int b);
73
74 unsigned long min(unsigned long a, unsigned long b);
75 unsigned long max(unsigned long a, unsigned long b);
76
77 unsigned long min(unsigned int a, unsigned long b);
78 unsigned long max(unsigned int a, unsigned long b);
79
80 unsigned long min(unsigned long a, unsigned int b);
81 unsigned long max(unsigned long a, unsigned int b);
82
7083
7184 void swap(long& a, long& b);
7285 void swap(int& a, int& b);
1414 A Tour of NTL: Summary of Changes
1515 </p>
1616 </h1>
17
18
19 <p><hr><p>
20 <h3>
21 2016.05.30: Changes between NTL 9.8.1 and 9.9.0
22 </h3>
23
24 <ul>
25 <li>
26 Added
27 examples and documentation on how to use NTL's thread pools
28 and parallel for loops:
29 <a href="tour-ex7.html">see here</a>
30 <li>
31 The build procedure now puts files <tt>config_log.h</tt>
32 and <tt>wizard_log.h</tt>
33 in NTL's include directory.
34 These files contain comments that document what choices were
35 made during the build process,
36 including the <tt>CXXAUTOFLAGS</tt> value.
37 <li>
38 Added <tt>elts()</tt> method to <tt>UniqueArray</tt> and <tt>AlignedArray</tt>
39 (for compatibility with <tt>Vec</tt> class)
40
41 <li>
42 Added <tt>get()</tt> and <tt>release()</tt> methods to <tt>OptionalVal</tt>
43
44 <li>
45 Made constructors for <tt>PartitionInfo</tt> and <tt>BasicThreadPool</tt>
46 <i>explicit</i>
47
48 <li>
49 Cleaned up some pointer issues in <tt>mat_lzz_p.c</tt> (mainly academic)
50
51 <li>
52 Definition of <tt>NTL_TLS_LOCAL_INIT</tt> ensures that var names
53 a local reference, regardless of the implementation
54
55 <li>
56 Allow <tt>p.move(q)</tt>, where <tt>p</tt> is a <tt>UniquePtr&lt;T&gt;</tt>,
57 <tt>q</tt> is a <tt>UniquePtr&lt;Y&gt;</tt>,
58 and <tt>Y*</tt> converts to <tt>T*</tt>.
59
60 <li>
61 Introduced <tt>PreconditionedRemainder</tt> class
62 for faster reduction of a <tt>ZZ</tt> modulo a fixed long.
63 This is intended to make Chinese Remaindering type computations faster
64 <ul>
65 <li>
66 for the time being,
67 this is an undocumented feature which may be modified or removed
68 in a future release
69 </ul>
70
71 <li>
72 Introduced <tt>ll_type</tt> and related routines which perform
73 a restricted set of operations on a long-long-like type.
74 It can be implemented via inline asm, and is a cleaner
75 interface and sometimes faster.
76 On x86-64/gcc platforms, the assembly code version is
77 used and gives a modest speed boost.
78 For all other platforms (including x86-64 with clang or icc),
79 the assembly code is not used.
80 I should really dynamically enable the assembly via the performance
81 tuning wizard, but I don't do this yet.
82 To explicitly disable the assembly code,
83 configure with <tt>NTL_DISABLE_LL_ASM=on</tt>.
84 <ul>
85 <li>
86 for the time being,
87 this is an undocumented feature which may be modified or removed
88 in a future release
89 </ul>
90
91 </ul>
92
93
94 <p><hr><p>
95 <h3>
96 2016.04.29: Changes between NTL 9.8.0 and 9.8.1
97 </h3>
98
99 <ul>
100 <li>
101 Fixed an annoying issue that could cause unnecessary
102 ambiguities in client code when compiling with <tt>NTL_EXCEPTIONS=on</tt>
103 </ul>
104
105
106 <p><hr><p>
107 <h3>
108 2016.04.26: Changes between NTL 9.7.1 and 9.8.0
109 </h3>
110
111 <ul>
112 <p> <li>
113 <b>Thread safety for the masses!</b>
114
115 <ul>
116 <li>
117 Previous versions of NTL required full <tt>C++11</tt>
118 compliance to achieve thread safety
119 <li>
120 Unfortunately, many platforms (notably, Mac OSX)
121 do not provide the necessary
122 features - in particular, they do not provide full, correct support
123 for "thread local storage" (TLS)
124 <li>
125 This new release (by default) will apply a "TLS hack"
126 that works around this limitation (at least for
127 gcc and gcc-compatible compilers such as clang and icc)
128 <ul>
129 <li>
130 With this "hack", it is only required that gcc's
131 more widely available <tt>__thread</tt>
132 storage specifier be implemented, rather than the less widely available
133 <tt>thread_local</tt> specifier (and it also makes direct use
134 of the <tt>pthread</tt> library)
135 <li>
136 You can explicitly disable the hack by configuring NTL
137 with <tt>NTL_DISABLE_TLS_HACK=on</tt>
138 </ul>
139 <li>
140 This "hack" has been successfully
141 tested on Linux with gcc 4.8.5
142 and on Mac OSX 10.10 and 10.11 with clang
143 <ul>
144 <li>
145 It should work with any gcc 4.8.x or higher
146 <li>
147 Many thanks to Justin Walker for pushing this issue and
148 helping with the Mac OSX testing
149 </ul>
150 </ul>
151
152 <li><p>
153 Fixed a "pseudo" bug in the test script: <tt>BitMatTest</tt>
154 in <tt> make check</tt> was reporting "failure", but this was
155 a bug in <tt>BitMatTest</tt>, not in NTL itself.
156
157 <li>
158 Fixed a real bug in the <tt>ReleaseThreadPool</tt>
159 function (although NTL internally does not use this function,
160 so only client code that called it directly would be affected).
161
162
163 </ul>
164
165 <p><hr><p>
166 <h3>
167 2016.04.20: Changes between NTL 9.7.0 and 9.7.1
168 </h3>
169
170 <ul>
171
172 <li>
173 Extended the performance improvements in
174 <a href="mat_lzz_p.cpp.html">mat_lzz_p</a>
175 to include the <tt>gauss</tt>, <tt>kernel</tt>,
176 and <tt>image</tt> routines
177
178 <li>
179 Generally improved
180 performance for all of the <a href="mat_lzz_p.cpp.html">mat_lzz_p</a> routines,
181 including an implementation of Strassen for matrix multiplication.
182
183 <li>
184 Added the matrix/column vector <tt>solve</tt> routines
185 to all other matrix classes (for consistency).
186
187 <p>
188 <li>
189 Fixed a compile-time bug that occurred on certain platforms
190 (mainly Windows).
191
192 <li>
193 Made some of the steps in <tt>configure</tt> and <tt>make</tt>
194 a bit more quiet (look at <tt>.log</tt> files for outputs).
195
196 </ul>
197
198
199 <p><hr><p>
200 <h3>
201 2016.03.12: Changes between NTL 9.6.4 and 9.7.0
202 </h3>
203
204 <ul>
205
206
207
208 <p>
209 <li>
210 Changes to <a href="mat_lzz_p.cpp.html">mat_lzz_p</a> module:
211 <ul>
212 <li>
213 Improved performance of <tt>mul</tt>, <tt>inv</tt>, <tt>solve</tt>,
214 and <tt>determinant</tt> routines:
215 <ul>
216 <li>
217 more cache friendly
218 <li>
219 thread boosted
220 <li>
221 for small p (up to 23 bits), exploits
222 AVX and FMA instructions (when available)
223 <li>
224 depending on many things,
225 the new code can be anywhere between
226 1.5x and 70x (!) times faster than the old code
227 (part of that speedup up can be attributed to just how
228 awful some of the old code was, rather than
229 how brilliant the new code is)
230 <li>
231 on the SandyBridge and Haswell machines I was able to test,
232 the new code is comparable in speed
233 to
234 <a href="https://linbox-team.github.io/fflas-ffpack/">FFLAS/FFPACK</a>
235 </ul>
236 <li>
237 Added "relaxed" versions of <tt>inv</tt>, <tt>solve</tt>, and
238 <tt>determinant</tt>,
239 which also now work for prime powers, not just primes
240 <li>
241 Added a new variant of <tt>solve</tt> routine to solve <tt>A*x = b</tt>
242 for column vectors
243 </ul>
244
245 <p>
246 <li>Changes to <a href="BasicThreadPool.cpp.html">BasicThreadPool</a>
247 module:
248 <ul>
249 <li>
250 Added <tt>NTL_EXEC_RANGE</tt> and other functionality which makes writing
251 "parallel for loops" simple (very similar to OpenMP),
252 and the same source code will work regardless of whether
253 threads or thread boosting is enabled.
254
255 <li>
256 Backward incompatibilities:
257 <ul>
258 <li>
259 <tt>NTLThreadPool</tt> is no longer directly accessible:
260 new access functions are provided
261 <li>
262 Got rid of method <tt>SplitProblems</tt>, and made a more general/abstract
263 class <tt>PartitionInfo</tt>
264 </ul>
265 </ul>
266
267
268 <p>
269 <li>
270 Miscellaneous:
271 <ul>
272 <li>
273 Improved crossover points for <tt>GF2X</tt> division
274
275 <li>
276 Made access to thread local variables used in NTL faster
277 by using GCC's <tt>__thread</tt> in place of <tt>thread_local</tt>,
278 wherever possible
279
280 <li>
281 Improved performance of <tt>vec_long</tt> to <tt>vec_zz_p</tt> conversion
282
283 <li>
284 Made AVX and FMA detection more robust, requiring LP64
285
286 <li>
287 Added <tt>InvModStatus</tt> for <tt>long</tt>'s
288
289 <li>
290 Bumped <tt>FILE_THRESH</tt> to 1e12
291 </ul>
292 </ul>
293
294 <p><hr><p>
295 <h3>
296 2016.01.30: Changes between NTL 9.6.3 and 9.6.4
297 </h3>
298
299 <ul>
300 <li>
301 Streamlined some of the installation scripts,
302 so now the "heuristic selection of compiler flags"
303 and the "nonstandard feature testing" procedures are more structured
304 so as to be easier to extend in the future -- it is beginning to
305 act more like a sort of "autoconf".
306 <li>
307 Fixed a couple of "buglets" in the header files.
308 </ul>
309
310
311 <p><hr><p>
312 <h3>
313 2016.01.26: Changes between NTL 9.6.2 and 9.6.3
314 </h3>
315
316 <ul>
317 <li>
318 Some changes to the installation procedure:
319 <ul>
320 <li>
321 For the Unix distribution, <tt>NTL_GMP_LIP</tt> is now
322 <i>on</i> by default, which means that by default, NTL will use
323 GMP.
324 <li>
325 By default, the configuration script will attempt a
326 "native" build by passing <tt>-march=native</tt>
327 as a compiler flag.
328 Most modern compilers support this, but the configuration script will
329 check to make sure.
330 <li>
331 The <tt>NTL_PCLMUL</tt> flag (which enables the use of
332 Intel's PCLMUL instruction) is now automagically set by the
333 Wizard script.
334 <li>
335 The build script automatically checks for availability of Intel
336 <tt>AVX</tt> intrinsics, which may be used to better
337 optimize certain code.
338 </ul>
339 <li>
340 A new modular composition implementation for <tt>zz_pX</tt>.
341 This makes modular composition up to 3x faster, depending
342 on several factors.
343 <a href="lzz_pX.cpp.html#compmod">See here</a> for details.
344
345 <li>
346 Improved performance for polynomial factoring over <tt>zz_pX</tt>
347 using <tt>CanZass</tt>,
348 using the improved modular composition routine (above)
349 and better choice of baby step / giant step parameters.
350 This leads to a 1.1x to 1.8x speedup, depending on several factors.
351
352 <li>
353 Improved robustness of <tt>quad_float</tt> implementation:
354 it should now work correctly on platforms that are too
355 liberal in their use of FMA instructions.
356
357
358 </ul>
359
360 <p><hr><p>
361 <h3>
362 2015.11.13: Changes between NTL 9.6.1 and 9.6.2
363 </h3>
364
365 <ul>
366 <li>
367 More small tweaks and a new configuration variable:
368 <pre>
369 NTL_MAXIMIZE_SP_NBITS=off
370
371 # Allows for 62-bit single-precision moduli on 64-bit platforms.
372 # By default, such moduli are restricted to 60 bits, which
373 # usually gives *slightly* better performance across a range
374 # of parameters.
375 </pre>
376
377 </ul>
378
379 <p><hr><p>
380 <h3>
381 2015.11.13: Changes between NTL 9.6.0 and 9.6.1
382 </h3>
383
384 <ul>
385 <li>
386 Streamlined some awkward code in <tt>g_lip_impl.h</tt>.
387 <li>
388 Made <tt>QuickTest</tt> a bit quicker.
389 <li>
390 Fixed some documentation/packaging problems.
391 </ul>
392
393 <p><hr><p>
394 <h3>
395 2015.11.10: Changes between NTL 9.5.0 and 9.6.0
396 </h3>
397
398 <ul>
399 <li>
400 More performance tuning for <tt>ZZ_pX</tt> arithmetic.
401
402 <li>
403 Added configuration variable <tt>CXXAUTOFLAGS</tt>,
404 which is dynamically (and heuristically) set by the configuration
405 script.
406 This way, <tt>CXXFLAGS</tt> is not modified by the script.
407 </ul>
408
409
410 <p><hr><p>
411 <h3>
412 2015.10.20: Changes between NTL 9.4.0 and 9.5.0
413 </h3>
414
415 <ul>
416 <li>
417 Added a new <i>thread boosting</i> feature.
418 With this feature, certain code within NTL will use available
419 threads to speed up certain computations on a multicore
420 machine.
421 This feature is enabled by setting <tt>NTL_THREAD_BOOST=on</tt>
422 during configuration.
423 See <a href="BasicThreadPool.cpp.html">BasicThreadPool.txt</a>
424 for more information.
425
426 <p>
427 This feature is a work in progress.
428 Currently, basic <tt>ZZ_pX</tt> arithmetic has been thread boosted.
429 More code will be boosted later.
430
431 <li>
432 A bit more performance tuning for <tt>ZZ_pX</tt> arithmetic,
433 and better crossovers for <tt>ZZX</tt> multiplication.
434
435 </ul>
436
437 <p><hr><p>
438 <h3>
439 2015.9.22: Changes between NTL 9.3.0 and 9.4.0
440 </h3>
441
442 <ul>
443 <li>
444 Performance tuning: <tt>ZZ_pX</tt> and <tt>zz_pX</tt> keep
445 getting faster
446
447 <li>
448 Upgrade to pseudo-random number generation:
449 I replaced the underlying PRG with Chacha20 (replacing RC4)
450 and the underlying key-derivation function with a function
451 based on HMAC-SHA256 (replacing an MD5-based function).
452 The new routines are faster and more secure.
453 <p>
454 I also expanded the PRG interface a bit:
455 <a href="ZZ.cpp.html#prg">see here</a> for details.
456
457 <li>
458 Bug fixes: fixed a (mostly dormant) bug in the <tt>ZZFromBytes</tt>
459 routine (triggered only when <tt>n==0</tt>).
460
461 <li>
462 Added documentation for classes <tt>RRPush</tt> and
463 <tt>RROutputPush</tt>:
464 <a href="RR.cpp.html#push">see here</a> for details.
465
466 </ul>
467
468
469
17470
18471 <p><hr><p>
19472 <h3>
61514 This might change in the future.
62515
63516 <p>
64 For details, look <a href="ZZ.cpp.html#modarith">here</a>,
517 For details, see <a href="ZZ.cpp.html#modarith">here</a>,
65518 including the comments entitled "Compatibility notes".
66519
67520 <p>
231684 interface (although this is not recommended practice).
232685
233686 <p>
234 For details, look <a href="ZZ.cpp.html#modarith">here</a>,
687 For details, <a href="ZZ.cpp.html#modarith">see here</a>,
235688 including the comments entitled "Compatibility notes".
236689
237690 <p>
2525 #include <NTL/ZZ_pXFactoring.h>
2626 #include <NTL/ZZ_pEX.h>
2727
28 NTL_CLIENT
28 using namespace std;
29 using namespace NTL;
2930
3031 int main()
3132 {
5354 }
5455 ENDPLAIN -->
5556 <!-- STARTPRETTY {{{ -->
56 <p><p><table cellPadding=10px><tr><td><font color="#000000">
57 <font face="monospace">
58 <font color="#1874cd">#include&nbsp;</font><font color="#4a708b">&lt;NTL/ZZ_pXFactoring.h&gt;</font><br>
59 <font color="#1874cd">#include&nbsp;</font><font color="#4a708b">&lt;NTL/ZZ_pEX.h&gt;</font><br>
57 <p><p><table cellPadding=10px><tr><td><font color="#000000"><font face="monospace">
58 <font color="#1773cc">#include </font><font color="#4a6f8b">&lt;NTL/ZZ_pXFactoring.h&gt;</font><br>
59 <font color="#1773cc">#include </font><font color="#4a6f8b">&lt;NTL/ZZ_pEX.h&gt;</font><br>
6060 <br>
61 NTL_CLIENT<br>
61 <font color="#b02f60"><b>using</b></font>&nbsp;<font color="#008b00"><b>namespace</b></font>&nbsp;std;<br>
62 <font color="#b02f60"><b>using</b></font>&nbsp;<font color="#008b00"><b>namespace</b></font>&nbsp;NTL;<br>
6263 <br>
6364 <font color="#008b00"><b>int</b></font>&nbsp;main()<br>
6465 {<br>
65 &nbsp;&nbsp; ZZ_p::init(ZZ(<font color="#ff8c00">17</font>));&nbsp;<font color="#0000ee"><i>// define GF(17)</i></font><br>
66 &nbsp;&nbsp; ZZ_p::init(ZZ(<font color="#ff8b00">17</font>)); <font color="#0000ed"><i>// define GF(17)</i></font><br>
6667 <br>
6768 &nbsp;&nbsp; ZZ_pX P;<br>
68 &nbsp;&nbsp; BuildIrred(P,&nbsp;<font color="#ff8c00">10</font>);&nbsp;<font color="#0000ee"><i>// generate an irreducible polynomial P</i></font><br>
69 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#0000ee"><i>// of degree 10 over GF(17)</i></font><br>
69 &nbsp;&nbsp; BuildIrred(P, <font color="#ff8b00">10</font>); <font color="#0000ed"><i>// generate an irreducible polynomial P</i></font><br>
70 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#0000ed"><i>// of degree 10 over GF(17)</i></font><br>
7071 <br>
71 &nbsp;&nbsp; ZZ_pE::init(P);&nbsp;<font color="#0000ee"><i>// define GF(17^10)</i></font><br>
72 &nbsp;&nbsp; ZZ_pE::init(P); <font color="#0000ed"><i>// define GF(17^10)</i></font><br>
7273 <br>
73 &nbsp;&nbsp; ZZ_pEX f, g, h;&nbsp;&nbsp;<font color="#0000ee"><i>// declare polynomials over GF(17^10)</i></font><br>
74 &nbsp;&nbsp; ZZ_pEX f, g, h;&nbsp;&nbsp;<font color="#0000ed"><i>// declare polynomials over GF(17^10)</i></font><br>
7475 <br>
75 &nbsp;&nbsp; random(f,&nbsp;<font color="#ff8c00">20</font>);&nbsp;&nbsp;<font color="#0000ee"><i>// f is a random, monic polynomial of degree 20</i></font><br>
76 &nbsp;&nbsp; SetCoeff(f,&nbsp;<font color="#ff8c00">20</font>);<br>
76 &nbsp;&nbsp; random(f, <font color="#ff8b00">20</font>);&nbsp;&nbsp;<font color="#0000ed"><i>// f is a random, monic polynomial of degree 20</i></font><br>
77 &nbsp;&nbsp; SetCoeff(f, <font color="#ff8b00">20</font>);<br>
7778 <br>
78 &nbsp;&nbsp; random(h,&nbsp;<font color="#ff8c00">20</font>);&nbsp;<font color="#0000ee"><i>// h is a random polynomial of degree less than 20</i></font><br>
79 &nbsp;&nbsp; random(h, <font color="#ff8b00">20</font>); <font color="#0000ed"><i>// h is a random polynomial of degree less than 20</i></font><br>
7980 <br>
80 &nbsp;&nbsp; g = MinPolyMod(h, f);&nbsp;<font color="#0000ee"><i>// compute the minimum polynomial of h modulo f</i></font><br>
81 &nbsp;&nbsp; g = MinPolyMod(h, f); <font color="#0000ed"><i>// compute the minimum polynomial of h modulo f</i></font><br>
8182 <br>
82 &nbsp;&nbsp;&nbsp;<font color="#b03060"><b>if</b></font>&nbsp;(g ==&nbsp;<font color="#ff8c00">0</font>) Error(<font color="#4a708b">&quot;oops (1)&quot;</font>);&nbsp;<font color="#0000ee"><i>// check that g != 0</i></font><br>
83 &nbsp;&nbsp; <font color="#b02f60"><b>if</b></font>&nbsp;(g == <font color="#ff8b00">0</font>) Error(<font color="#4a6f8b">&quot;oops (1)&quot;</font>); <font color="#0000ed"><i>// check that g != 0</i></font><br>
8384 <br>
84 &nbsp;&nbsp;&nbsp;<font color="#b03060"><b>if</b></font>&nbsp;(CompMod(g, h, f) !=&nbsp;<font color="#ff8c00">0</font>)&nbsp;<font color="#0000ee"><i>// check that g(h) = 0 mod f</i></font><br>
85 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Error(<font color="#4a708b">&quot;oops (2)&quot;</font>);<br>
85 &nbsp;&nbsp; <font color="#b02f60"><b>if</b></font>&nbsp;(CompMod(g, h, f) != <font color="#ff8b00">0</font>) <font color="#0000ed"><i>// check that g(h) = 0 mod f</i></font><br>
86 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Error(<font color="#4a6f8b">&quot;oops (2)&quot;</font>);<br>
8687 }<br>
87 </font>
88 </font></td></tr></table><p><p>
88 </font></font></td></tr></table><p><p>
8989 <!-- }}} ENDPRETTY -->
9090
9191
66 <center>
77 <a href="tour-ex5.html"><img src="arrow1.gif" alt="[Previous]" align=bottom></a>
88 <a href="tour-examples.html"><img src="arrow2.gif" alt="[Up]" align=bottom></a>
9 <img src="arrow3.gif" alt="[Next]" align=bottom>
9 <a href="tour-ex7.html"><img src="arrow3.gif" alt="[Next]" align=bottom></a>
1010 </center>
1111
1212 <h1>
3434 <!-- STARTPLAIN
3535 #include <NTL/RR.h>
3636
37 using namespace std;
38 using namespace NTL;
39
3740 int main()
3841 {
3942 RR acc, val;
4851 <!-- STARTPRETTY {{{ -->
4952 <p><p><table cellPadding=10px><tr><td><font color="#000000"><font face="monospace">
5053 <font color="#1773cc">#include </font><font color="#4a6f8b">&lt;NTL/RR.h&gt;</font><br>
54 <br>
55 <font color="#b02f60"><b>using</b></font>&nbsp;<font color="#008b00"><b>namespace</b></font>&nbsp;std;<br>
56 <font color="#b02f60"><b>using</b></font>&nbsp;<font color="#008b00"><b>namespace</b></font>&nbsp;NTL;<br>
5157 <br>
5258 <font color="#008b00"><b>int</b></font>&nbsp;main()<br>
5359 {<br>
123129 <center>
124130 <a href="tour-ex5.html"><img src="arrow1.gif" alt="[Previous]" align=bottom></a>
125131 <a href="tour-examples.html"><img src="arrow2.gif" alt="[Up]" align=bottom></a>
126 <img src="arrow3.gif" alt="[Next]" align=bottom>
132 <a href="tour-ex7.html"><img src="arrow3.gif" alt="[Next]" align=bottom></a>
127133 </center>
128134
129135 </body>
0 <html>
1 <head>
2 <title>
3 A Tour of NTL: Examples: Thread Pools</title>
4 </head>
5
6 <center>
7 <a href="tour-ex6.html"><img src="arrow1.gif" alt="[Previous]" align=bottom></a>
8 <a href="tour-examples.html"><img src="arrow2.gif" alt="[Up]" align=bottom></a>
9 <img src="arrow3.gif" alt="[Next]" align=bottom>
10 </center>
11
12 <h1>
13 <p align=center>
14 A Tour of NTL: Examples: Thread Pools
15 </p>
16 </h1>
17
18 <p> <hr> <p>
19
20 If you have built NTL with <tt>NTL_THREAD_BOOST=on</tt>,
21 then not only is NTL thread safe, but certain parts
22 of NTL are designed to use multiple threads to speed things
23 up.
24 To implement this, NTL makes use of a <i>thread pool</i>,
25 which is a collection of threads that are created once
26 and then used over and over again, to avoid the significant
27 overhead of thread creation and destruction.
28 You can also use this same thread pool to speed up
29 NTL client code.
30 <p>
31 To use this feature, you have to include the header file
32 <tt>NTL/BasicThreadPool.h</tt>.
33 In your main program, you should also indicate how many threads
34 you want in the pool.
35 If you want, say, 8 threads, you do this by calling the function
36 <tt>SetNumThreads(8)</tt>.
37 <p>
38 If you do this, then certain parts of NTL will use these
39 threads when possible (this is a work in progress).
40 To use these threads in your own code, the easiest way
41 to do this is with a <i>parallel for loop</i>,
42 illustrated in the following example.
43
44 See <a href="BasicThreadPool.cpp.html"><tt>BasicThreadPool.txt</tt></a>
45 for more details.
46
47 Consider the following routine:
48
49
50 <!-- STARTPLAIN
51 void mul(ZZ *x, const ZZ *a, const ZZ *b, long n)
52 {
53 for (long i = 0; i < n; i++)
54 mul(x[i], a[i], b[i]);
55 }
56 ENDPLAIN -->
57 <!-- STARTPRETTY {{{ -->
58 <p><p><table cellPadding=10px><tr><td><font color="#000000"><font face="monospace">
59 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;mul(ZZ *x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ *a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ *b, <font color="#008b00"><b>long</b></font>&nbsp;n) <br>
60 &nbsp;&nbsp; {<br>
61 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color="#b02f60"><b>for</b></font>&nbsp;(<font color="#008b00"><b>long</b></font>&nbsp;i = <font color="#ff8b00">0</font>; i &lt; n; i++)<br>
62 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; mul(x[i], a[i], b[i]);<br>
63 &nbsp;&nbsp; }<br>
64 </font></font></td></tr></table><p><p>
65 <!-- }}} ENDPRETTY -->
66
67
68
69
70
71
72 <p>
73 We can parallelize it as follows:
74
75 <!-- STARTPLAIN
76 void mul(ZZ *x, const ZZ *a, const ZZ *b, long n)
77 {
78 NTL_EXEC_RANGE(n, first, last)
79
80 for (long i = first; i < last; i++)
81 mul(x[i], a[i], b[i]);
82
83 NTL_EXEC_RANGE_END
84 }
85 ENDPLAIN -->
86 <!-- STARTPRETTY {{{ -->
87 <p><p><table cellPadding=10px><tr><td><font color="#000000"><font face="monospace">
88 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;mul(ZZ *x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ *a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ *b, <font color="#008b00"><b>long</b></font>&nbsp;n) <br>
89 &nbsp;&nbsp; {<br>
90 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NTL_EXEC_RANGE(n, first, last) <br>
91 <br>
92 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#b02f60"><b>for</b></font>&nbsp;(<font color="#008b00"><b>long</b></font>&nbsp;i = first; i &lt; last; i++)<br>
93 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;mul(x[i], a[i], b[i]);<br>
94 <br>
95 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NTL_EXEC_RANGE_END<br>
96 &nbsp;&nbsp; }<br>
97 </font></font></td></tr></table><p><p>
98 <!-- }}} ENDPRETTY -->
99
100
101
102
103 <p>
104 <tt>NTL_EXEC_RANGE</tt> and
105 <tt>NTL_EXEC_RANGE_END</tt> are macros that just <i>do the right
106 thing</i>. If there are <i>nt</i> threads available, the interval
107 &#91;0..<i>n</i>) will be
108 partitioned into (up to) <i>nt</i> subintervals, and a different thread will be
109 used to process each subinterval. You still have to write the for loop
110 yourself: the macro just declares and initializes variables <i>first</i> and
111 <i>last</i> (or whatever you want to call them) of type <tt>long</tt>
112 that represent the
113 subinterval &#91;<i>first</i>..<i>last</i>) to be processed by one thread.
114
115
116
117 <p>
118 Note that the current thread participates as one of the <i>nt</i> available
119 threads, and that the current thread will wait for all participating threads
120 to finish their task before proceeding.
121
122 <p>
123 Within the "body" of this construct, you can freely reference any variables
124 that are visible at this point. This is implemented using the C++ lambda
125 feature (capturing all variables by reference).
126
127 <p>
128 This construct will still work even if threads are disabled, in which case
129 it runs single-threaded with <i>first=0</i> and <i>last=n</i>.
130
131 <p>
132 Note that the code within the <tt>EXEC_RANGE</tt>
133 body could call other routines that
134 themselves attempt to execute an <tt>EXEC_RANGE</tt>:
135 if this happens, the latter
136 <tt>EXEC_RANGE</tt> will detect this and run single-threaded.
137
138 <p>
139 You may wish to do other things within the <tt>EXEC_RANGE</tt>
140 body than just execute
141 a loop. One thing you may want to do is to declare variables. Another
142 thing you may want to do is set up a local context
143 for a <tt>ZZ_p</tt> modulus (or
144 other type of modulus).
145 Here is an example of doing this:
146
147
148 <!-- STARTPLAIN
149 void mul(ZZ_p *x, const ZZ_p *a, const ZZ_p *b, long n)
150 {
151 ZZ_pContext context;
152 context.save();
153
154 NTL_EXEC_RANGE(n, first, last)
155
156 context.restore();
157
158 for (long i = first; i < last; i++)
159 mul(x[i], a[i], b[i]);
160
161 NTL_EXEC_RANGE_END
162 }
163 ENDPLAIN -->
164 <!-- STARTPRETTY {{{ -->
165 <p><p><table cellPadding=10px><tr><td><font color="#000000"><font face="monospace">
166 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;mul(ZZ_p *x, <font color="#008b00"><b>const</b></font>&nbsp;ZZ_p *a, <font color="#008b00"><b>const</b></font>&nbsp;ZZ_p *b, <font color="#008b00"><b>long</b></font>&nbsp;n) <br>
167 &nbsp;&nbsp; {<br>
168 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;ZZ_pContext context;<br>
169 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;context.save();<br>
170 <br>
171 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NTL_EXEC_RANGE(n, first, last) <br>
172 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <br>
173 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; context.restore();<br>
174 <br>
175 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#b02f60"><b>for</b></font>&nbsp;(<font color="#008b00"><b>long</b></font>&nbsp;i = first; i &lt; last; i++)<br>
176 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;mul(x[i], a[i], b[i]);<br>
177 <br>
178 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NTL_EXEC_RANGE_END<br>
179 &nbsp;&nbsp; }<br>
180 </font></font></td></tr></table><p><p>
181 <!-- }}} ENDPRETTY -->
182
183
184
185
186 <p>
187 A lower-level set of tools is available, which allows for
188 more fine-grained control.
189 See <a href="BasicThreadPool.cpp.html">BasicThreadPool.txt</a>
190 for more details.
191
192 <center>
193 <a href="tour-ex6.html"><img src="arrow1.gif" alt="[Previous]" align=bottom></a>
194 <a href="tour-examples.html"><img src="arrow2.gif" alt="[Up]" align=bottom></a>
195 <img src="arrow3.gif" alt="[Next]" align=bottom>
196 </center>
197
198 </body>
199 </html>
3333 <li> <a href="tour-ex4.html">Modular Arithmetic</a>
3434 <li> <a href="tour-ex5.html">Extension Rings and Fields</a>
3535 <li> <a href="tour-ex6.html">Floating Point Classes</a>
36 <li> <a href="tour-ex7.html">Thread Pools</a>
3637
3738
3839 </ol>
3333 The speedup is most dramatic on x86 machines.
3434
3535 <p>
36 By default, NTL uses a long integer package derived
36 As of version 9.6.3, NTL uses GMP by default.
37 You can disable GMP by passing <tt>NTL_GMP_LIP=off</tt>
38 as an option to NTL's <tt>configure</tt> script.
39 If you disable the use of GMP,
40 NTL uses a long integer package derived
3741 from Arjen Lenstra's LIP package.
38 However, for extra speed, it is recommended to use GMP.
39 Building NTL with GMP
40 takes a few extra minutes work,
41 and you certainly do not need to use NTL with GMP if you don't want to.
42 As far as I know, GMP is only available on Unix systems
43 and on Windows systems using Cygwin tools.
42 This is not recommended: GMP is much faster.
4443
4544 <p>
4645 Even if you do not use GMP,
5150 <p>
5251 <b>Note:</b> GMP is thread safe, so you should feel free to use it
5352 in a thread-safe build of NTL.
54 However, the current version (v6)
53 However, the current version of GMP (v6.1)
5554 is <i>not</i> entirely exception friendly (it may
5655 abort a running program, but only in some very extreme and
5756 unusual circumstances).
6261 <h2>
6362 Downloading and building GMP
6463 </h2>
64 <p>
65
66 Many unix distributions now include GMP by default.
67 But if not, it is pretty easy to install it directly from source,
68 as follows.
6569 <p>
6670
6771 Download GMP from <a href="http://gmplib.org">here.</a>
103107 </h2>
104108 <p>
105109
106 When building NTL with GMP, you have to tell NTL that you want to
107 use GMP as the long integer package,
108 and where the include files and library are.
109 The easiest way to do this is by passing the argument
110 <tt>NTL_GMP_LIP=on</tt> to the NTL configuration script
111 when you are <a href="tour-unix.html">installing NTL</a>.
112 Assuming you installed GMP in <tt>$HOME/sw</tt> as above,
110 When you are <a href="tour-unix.html">installing NTL</a>,
111 if you installed GMP in <tt>$HOME/sw</tt> as above,
113112 and you also want to install NTL in <tt>$HOME/sw</tt>,
114113 you execute:
115114 <pre>
116 % ./configure PREFIX=$HOME/sw NTL_GMP_LIP=on GMP_PREFIX=$HOME/sw
115 % ./configure PREFIX=$HOME/sw GMP_PREFIX=$HOME/sw
117116 </pre>
118117 You can write this more simply as
119118 <pre>
120 % ./configure DEF_PREFIX=$HOME/sw NTL_GMP_LIP=on
119 % ./configure DEF_PREFIX=$HOME/sw
121120 </pre>
122121 Here, <tt>DEF_PREFIX</tt> is a variable that is used
123122 to specify the location of all software,
129128 standard system directory where your compiler will look by default)
130129 then simply
131130 <pre>
132 % ./configure PREFIX=$HOME/sw NTL_GMP_LIP=on
131 % ./configure PREFIX=$HOME/sw
133132 </pre>
134133 does the job.
135134 Moreover, if NTL is also to be installed in <tt>/usr/local</tt>,
136135 then
137136 <pre>
138 % ./configure NTL_GMP_LIP=on
137 % ./configure
139138 </pre>
140139 does the job.
141140
156155 then just <tt>-lgmp</tt> does the job.
157156 Note that <tt>-lgmp</tt> must come <i>after</i> <tt>-lntl</tt>
158157 on the command line.
159 Finally, if NTL is installed
160 a shared libraries, then you don't even need <tt>-lgmp</tt>.
158 Finally, if NTL and GMP are installed
159 as shared libraries, then you don't even need <tt>-lgmp</tt>.
161160
162161
163162 <p>
164163 NTL has been tested and works correctly with GMP versions 3.1, 3.1.1,
165 4.1.4, and 5.1 (among others).
164 4.1.4, 5.1, 6.0, and 6.1 (among others).
166165 It is not possible to use versions of GMP prior to 3.1 with NTL.
167166
168167 <p>
186186 to doubles produce <i>exact</i> results, provided the inputs and outputs
187187 are less than <tt>2^p</tt> in absolute value,
188188 <li>
189 if <tt>y/2 &lt;= x &lt;= 2y</tt>, then <tt>x-y</tt> is computed exactly.
189 assuming no overflow, <tt>x - long(x)</tt> produces an exact result for nonnegative <tt>x</tt>.
190190 </ul>
191 Also, NTL allows the compiler to compute <tt>z = x/y</tt> as
192 <tt>t = 1/y</tt>, <tt>z = t*x</tt>.
193191
194192 <p>
195193 It is also generally assumed that the compiler does not
209207 Unfortunately, some compilers do not do this correctly,
210208 unless you tell them.
211209 With Intel's C compiler <tt>icc</tt>, for example,
212 you should compile NTL with the flag <tt>-fp-model source</tt>
210 you should compile NTL with the flag <tt>-fp-model strict</tt>
213211 to enforce strict adherence to floating point standards.
212 That said, some effort has been made to ensure that NTL
213 works correctly even if the compiler does perform such
214 regrouping, including replacement of <tt>x/y</tt>
215 by <tt>x*(1/y)</tt>.
216
217 <p>
214218 Also, you should be wary of compiling using an optimization
215219 level higher than the default <tt>-O2</tt> --
216220 this may break some floating point assumptions (and maybe
217221 some other assumptions as well).
218222
219
223 <p>
224 In any case, programs that compile against NTL header files
225 should compile correctly, even under very aggressive optimizations.
220226
221227 <p>
222228 One big problem with the IEEE standard is that it allows intermediate
246252 Hopefully, because of the newer SSE instructions, this whole strict/loose
247253 issue is a thing of the past.
248254
255 <p>
256 Another problem is that some hardware (especially newer Intel chips)
257 supports fused multiply-add (FMA) instructions.
258 Again, this is only a problem for <tt>quad_float</tt>, and some
259 care is taken to detect the problem and to work around it.
260 The rest of NTL will work fine regardless.
261
249262
250263
251264 <p>
264277 and "not a number" are implemented correctly.
265278
266279
267 <p>
268 <h3>Implementing long integer arithmetic</h3>
269 <p>
270 There are two basic strategies for implementing long integer arithmetic.
271
272 <p>
273 The default strategy is implemented in the
274 <i>traditional long integer arithmetic package</i>.
275 This package is derived from the LIP package originally developed by
276 A. K. Lenstra, although it has evolved quite a bit within NTL.
277 This package uses no assembly code and is very portable.
278
279 <p>
280 The alternative strategy is to use GMP in place of LIP.
281 In this strategy, the representation of long integers is in a
282 form compatible with GMP.
283 This strategy typically yields the best performance,
284 but requires
285 that GMP is installed on your platform.
286
287 <p>
288 <a href="tour-gmp.html">Go here</a> for more details on the use
289 of GMP with NTL.
280
290281
291282 <p>
292283 <h3>Algorithms</h3>
324315 </h3>
325316 <p>
326317 As of v7.0, NTL is thread safe.
327 That said, there are two things to be aware of:
318 That said, there are several things to be aware of:
328319 <ul>
329 <li>
330 While extreme care has been taken with the design and implementation
331 of thread safety, this feature is still very new and not been
332 subjected to a lot of testing.
333 <li>
334 <p>
320
321 <li>
335322 To use this feature, you have to enable <tt>NTL_THREADS</tt>
336323 in the configuration script.
337324 Also, you will need a compiler and runtime library that
338325 implements several key <tt>C++11</tt> features,
339326 including <tt>thread_local</tt> storage.
340
341 <p>
342 <i>
343 As I wrote this in Nov. 2014, there were very few compilers
344 that satisfy the requirements.
345 I was successfully able to build and test NTL with threads
346 using gcc 4.9.2 on a Linux system.
347 I have not been able to do so on any current Mac OSX system:
348 in fact, no compiler on current Mac systems (gcc or clang)
349 yet have proper support for <tt>thread_local</tt> storage.
350 </i>
351
352 <p>
353 <li>
354 NTL remains thread safe when built with GMP.
355 however, the current version (v1.1) of the external gf2x
327 <ul>
328 <li>
329 NOTE: as of v9.8, the requirements have been relaxed, so that
330 for gcc and gcc-compatible compilers
331 (such as clang and icc) only support of the gcc <tt>__thread</tt>
332 storage specifier is required.
333 <li>
334 With these relaxed requirements, it is possible to build
335 a thread safe version of NTL on Linux using gcc 4.8 and above,
336 or on Mac OSX 10.10 and above.
337
338 </ul>
339
340 <p> <li>
341 You must build NTL using GMP (i.e., configure with <tt>NTL_GMP_LIP=on</tt>).
342 The classic LIP integer arithmetic is not thread safe: it could
343 be made so, but it is not a priority at this time.
344
345 <p>
346 <li>
347 The current version (v1.1) of the external gf2x
356348 library is not thread safe.
357 Therefore, <b>you should build NTL using gf2x if you need a thread-safe
349 Therefore, <b>you should NOT build NTL using gf2x if you need a thread-safe
358350 build</b>.
359351 </ul>
360352
415407
416408 <p>
417409 <h3>
410 Thread Boosting
411 </h3>
412 <p>
413
414 As of v9.5.0, NTL provides a <i>thread boosting</i> feature.
415 With this feature, certain code within NTL will use available
416 threads to speed up computations on a multicore
417 machine.
418 This feature is enabled by setting <tt>NTL_THREAD_BOOST=on</tt>
419 during configuration.
420 See <a href="BasicThreadPool.cpp.html">BasicThreadPool.txt</a>
421 for more information.
422
423 <p>
424 This feature is a work in progress.
425 Currently, basic <tt>ZZ_pX</tt> arithmetic has been thread boosted.
426 More code will be boosted later.
427
428
429 <p>
430 <h3>
418431 Error Handling and Exceptions
419432 </h3>
420433 <p>
8080 <p>
8181
8282 <table >
83
84
85
86 <!-- ----------- BasicThreadPool.txt ----------- -->
87 <p><tr valign=top> <td> <b>
88 <a href="BasicThreadPool.cpp.html"><tt>BasicThreadPool</tt></a>
89
90 </b> <td>
91
92 class <tt>BasicThreadPool</tt>: a simple thread pool;
93 plus additional <i>thread boosting</i> features
8394
8495
8596
1919 <p> <hr> <p>
2020
2121 Here are some timing figures from using NTL.
22 They were obtained using NTL 8.1 compiled with <tt>g++</tt> 4.2.1
23 and with GMP 5.1 on a 2.8GHz Intel Core 2 Duo running on Max OSX 10.6.8.
22 They were obtained using NTL 9.9.0 compiled with <tt>g++</tt> 4.8.5
23 and with GMP 6.1 on a 2.3GHz Intel Haswell processor
24 (E5-2698 v3) running Linux.
2425
2526 <p>
2627 All times are in <i>seconds</i>.
3132 generate the same data.
3233
3334 <p>
35 NOTE: the PRG changed in v9.4.0, so there may be
36 some inconsistencies.
37
38 <p>
3439 <pre>
3540
36 multiply 1000-bit ints: 4.47942e-07
37 remainder 2000/1000-bit ints: 8.3923e-07
38 gcd 1000-bit ints: 1.07981e-05
39 multiply degree-1000 poly mod 1000-bit prime: 0.0140632
40 remainder degree-2000/1000 poly mod 1000-bit prime: 0.0386337
41 preconditioned remainder degree-2000/1000 poly mod 1000-bit prime: 0.0143691
42 gcd degree-1000 poly mod 1000-bit prime: 0.338774
43 multiply degree-1000 int poly with 1000-bit coeffs: 0.0162582
41 multiply 1000-bit ints: 1.76284e-07
42 remainder 2000/1000-bit ints: 3.60535e-07
43 gcd 1000-bit ints: 2.87045e-06
44 multiply degree-1000 poly mod 1000-bit prime: 0.00432981
45 remainder degree-2000/1000 poly mod 1000-bit prime: 0.0125583
46 preconditioned remainder degree-2000/1000 poly mod 1000-bit prime: 0.00443356
47 gcd degree-1000 poly mod 1000-bit prime: 0.122722
48 multiply degree-1000 int poly with 1000-bit coeffs: 0.00812543
4449
4550 factoring degree-1000 poly mod 1000-bit prime...
46 square-free decomposition...0.339693
51 square-free decomposition...0.122685
4752 factoring multiplicity 1, deg = 1000
48 computing X^p...22.9192
49 computing DDF...generating baby steps...+++++++++++++++++++++16.9463
50 generating giant steps...++++++++++++++++++++++17.695
51 giant refine...++++split 1 43
52 split 2 38
53 split 3 64
54 *++++split 5 108
55 *++++split 11 237
56 split 12 510
57 *giant refine time: 8.19123
58 baby refine...split 3 6
59 split 6 6
60 split 9 9
61 split 22 22
62 split 38 38
63 split 64 64
64 split 108 108
65 split 237 237
66 split 248 248
67 split 262 262
68 baby refine time: 0.611474
69 DDF time: 43.4528
70 computing EDF(3,2)...+0.077447
71 ...total time = 66.8182
53 computing X^p...7.23809
54 computing DDF...generating baby steps...+++++++++++++++++++++6.21623
55 generating giant steps...++++++++++++++++++++++6.49462
56 giant refine...++++split 1 1
57 split 2 26
58 *++++*++++*++++*++++*++*split 0 973
59 giant refine time: 4.8139
60 baby refine...split 1 1
61 split 26 26
62 split 973 973
63 baby refine time: 3.3e-05
64 DDF time: 17.5262
65 ...total time = 24.8965
7266
73 multiply 500-bit GF2Xs: 1.03529e-06
74 remainder 1000/500-bit GF2Xs: 6.06942e-06
75 gcd 500-bit GF2Xs: 1.19302e-05
67 multiply 500-bit GF2Xs: 5.54208e-08
68 remainder 1000/500-bit GF2Xs: 8.40658e-07
69 gcd 500-bit GF2Xs: 3.60963e-06
7670
77 factoring degree-500 GF2X: 0.0010868
78 gcd 500-bit GF2X: 1.18877e-05
79 multiply degree-500 poly mod 500-bit GF2X: 0.024646
80 remainder degree-1000/500 poly mod 500-bit GF2X: 0.0884258
81 preconditioned remainder degree-1000/500 poly mod 500-bit GF2X: 0.049274
82 gcd degree-500 poly mod 500-bit GF2X: 0.555096
71 factoring degree-500 GF2X: 0.00015574
72 gcd 500-bit GF2X: 3.61365e-06
73 multiply degree-500 poly mod 500-bit GF2X: 0.00251375
74 remainder degree-1000/500 poly mod 500-bit GF2X: 0.00905957
75 preconditioned remainder degree-1000/500 poly mod 500-bit GF2X: 0.00505149
76 gcd degree-500 poly mod 500-bit GF2X: 0.0478557
8377
8478 factoring degree-500 poly mod 500-bit GF2X...
85 square-free decomposition...0.038657
79 square-free decomposition...0.004635
8680 factoring multiplicity 1, deg = 250
87 computing X^p...4.93964
88 computing DDF...generating baby steps...++++++++++3.51054
89 generating giant steps...+++++++++++3.78422
90 giant refine...++++*++++split 6 59
91 split 7 68
92 *split 0 123
93 giant refine time: 2.59123
94 baby refine...split 59 59
95 split 68 68
96 split 123 123
97 baby refine time: 3.4e-05
98 DDF time: 9.88619
81 computing X^p...0.488941
82 computing DDF...generating baby steps...++++++++++0.332162
83 generating giant steps...+++++++++++0.357681
84 giant refine...++++split 1 9
85 split 2 13
86 split 4 44
87 *++++split 7 73
88 *split 0 111
89 giant refine time: 0.233787
90 baby refine...split 9 9
91 split 13 13
92 split 44 44
93 split 73 73
94 split 111 111
95 baby refine time: 0.001275
96 DDF time: 0.924938
9997
100 ...total time = 14.879
98 ...total time = 1.41792
99
101100
102101
103102 </pre>
1919 <p> <hr> <p>
2020
2121 <ol>
22
23 <li>
24 Build NTL using GMP as the long integer package.
25 This is extremely important, as the GMP implementation
26 of long integer arithmetic is <i>much</i> faster
27 than the default implementation.
28 Go <a href="tour-gmp.html">here</a> for details.
29
30 <p>
31 <li>
32 On many machines that optionally offer 64-bit integer arithmetic
33 (recent Mac OSX machines, for instance),
34 you should
35 compile using <tt>gcc</tt> with the option <tt>-m64</tt>
36 to get the full benefit.
37 To do this,
38 pass <tt>"CFLAGS=-O2 -m64"</tt>
39 to the <tt>configure</tt> script (note the use of quotes).
40 If you are using NTL with GMP on such a machine,
41 you <i>must</i> do this to get compatible code.
42 Note, however, that 64-bit is becoming the default, so this
43 may not be necessary.
44
45 <p>
46 <li>
47 On Sparcs,
48 pass the argument <tt>"CFLAGS=-O2 -mcpu=v8"</tt>
49 to the <tt>configure</tt> script.
50 On more recent, 64-bit sparcs, pass <tt>"CFLAGS=-O2 -mcpu=v9 -m64"</tt>
51 to get the full instruction set and 64-bit code.
5222
5323 <p>
5424 <li>
12393 <tt>a[i]*b[i]</tt>, in every loop iteration.
12494 The second does not.
12595
96 <p>
97 NOTE: actually, for the class <tt>ZZ</tt>, there is a
98 special function <tt>MulAddTo</tt>, with which one can write
99 the loop body simply as
100 <pre>
101 MulAddTo(res, a[i], b[i]);
102 </pre>
103
126104
127105
128106
133111 If you <i>must</i> switch the modulus often,
134112 use the class <tt>ZZ_pContext</tt> to save the information
135113 associated with the modulus (see <a href="ZZ_p.cpp.html">ZZ_p.txt</a>).
114 The same holds for analogous classes, such as <tt>zz_p</tt>
115 and <tt>GF2E</tt>.
136116
137117
138118
3737 % gunzip ntl-xxx.tar.gz
3838 % tar xf ntl-xxx.tar
3939 % cd ntl-xxx/src
40 % ./configure PREFIX=$HOME/sw
40 % ./configure
4141 % make
4242 % make check
4343 % make install
4444 </pre>
4545
46 This will build, test, and install NTL in <tt>$HOME/sw</tt>.
47 Of course, change <tt>$HOME/sw</tt> to whatever you want (the default is
48 <tt>/usr/local</tt>).
49 You will find the NTL header files in <tt>$HOME/sw/include/NTL</tt>
46 This will build, test, and install NTL in
47 <tt>/usr/local</tt>.
48 For this to work, GMP must already be installed
49 (most Unix distributions already come with GMP installed,
50 but see <a href="tour-gmp.html">this page</a> for more
51 details).
52 If you really do not want to use GMP,
53 you can pass the option
54 <tt>NTL_GMP_LIP=off</tt>
55 to <tt>configure</tt>.
56
57 <p>
58 After installation,
59 you will find the NTL header files in <tt>/usr/local/include/NTL</tt>
5060 and the compiled binary
51 in <tt>$HOME/sw/lib/libntl.a</tt>
61 in <tt>/usr/local/lib/libntl.a</tt>
5262 (this is a <i>static</i> library -- if you want a <i>shared</i>
5363 library, <a href="#shared">see below</a>).
54 <p>
55 If you really are interested in high-performace, you will
56 <i>definitely</i> want to build NTL
57 using GMP (the GNU Multi-Precision package).
58 If GMP has already been installed in a standard
59 place, like <tt>/usr/local</tt>, then invoke <tt>configure</tt>
60 above as
61 <pre>
62 % ./configure PREFIX=$HOME/sw NTL_GMP_LIP=on
63 </pre>
64 and if GMP is installed somewhere else, say <tt>$HOME/sw</tt>, then
65 either
66 <pre>
67 % ./configure PREFIX=$HOME/sw NTL_GMP_LIP=on GMP_PREFIX=$HOME/sw
68 </pre>
69 or, more simply,
70 <pre>
71 % ./configure DEF_PREFIX=$HOME/sw NTL_GMP_LIP=on
72 </pre>
73 does the job.
74 Here, <tt>DEF_PREFIX</tt> is a variable that is used
75 to specify the location of all software,
76 and it defaults to <tt>/usr/local</tt>.
77 <a href="tour-gmp.html">This page</a> provides more
78 details.
64 Documentation is in <tt>/usr/local/share/doc</tt>,
65 with the main entry-point at <tt>/usr/local/share/doc/tour.html</tt>.
66
67
7968
8069 <p>
8170 If you want very high-performance for polynomial arithmetic
8271 over <i>GF(2)</i>, you may want to consider using the <tt>gf2x</tt> library.
83 To do this, <tt>gf2x</tt> must already be installed somewhere.
72 To do this, <tt>gf2x</tt> must already be installed.
8473 In addition, you should invoke <tt>configure</tt>
8574 with the option <tt>NTL_GF2X_LIB=on</tt>.
86 If <tt>gf2x</tt> is installed in a standard location, this is
87 all you need to do;
88 otherwise, if <tt>gf2x</tt> is installed, say, in <tt>$HOME/sw</tt>,
89 then you also need to pass the option <tt>GF2X_PREFIX=$HOME/sw</tt>.
9075 <a href="tour-gf2x.html">This page</a> provides more details.
9176
92 <p>
93 Even if you don't want to experiment with the <tt>gf2x</tt>
94 library, you might want to try setting <tt>NTL_PCLMUL=on</tt>,
95 which will enable the use special hardware support for fast
96 polynomial arithmetic over <i>GF(2)</i> on platforms that support it
97 (the configure script will check that it actually works).
98 You can set <tt>NTL_PCLMUL=on</tt> even if you also set
99 <tt>NTL_GF2X_LIB=on</tt>, but it probably won't help much.
77
78
79 <p>
80 If you want to install NTL somewhere besides <tt>/usr/local</tt>,
81 pass the option <tt>PREFIX=/path/to/install/ntl</tt> to
82 <tt>configure</tt>.
83 If GMP is installed somewhere besides <tt>/usr/local</tt>,
84 pass the option
85 <tt>GMP_PREFIX=/path/to/gmp</tt>
86 to <tt>configure</tt>.
87 You can also pass
88 <tt>GF2X_PREFIX=/path/to/gf2x</tt>
89 to <tt>configure</tt>,
90 if <tt>gf2x</tt> is installed somewhere besides <tt>/usr/local</tt>.
91 As a shorthand, you can pass the option
92 <tt>DEF_PREFIX=/path/to/all/software</tt>, which will
93 override the default for <tt>PREFIX</tt>,
94 <tt>GMP_PREFIX</tt>, and <tt>GF2X_PREFIX</tt>.
95
10096
10197
10298 <p>
10399 Now suppose you want to compile a program that uses NTL.
104 Suppose you are working in some directory and <tt>foo.c</tt>
100 Suppose you are working in some arbitrary directory and <tt>foo.c</tt>
105101 is your program.
106 Assume that you have installed NTL in <tt>$HOME/sw</tt> as above.
102 Assume that you have installed NTL in <tt>/usr/local</tt> as above.
107103 The following should work:
108104 <pre>
109 % g++ -I$HOME/sw/include foo.c -o foo -L$HOME/sw/lib -lntl -lm
110 </pre>
111 If you are using GMP, then:
112 <pre>
113 % g++ -I$HOME/sw/include foo.c -o foo -L$HOME/sw/lib -lntl -lgmp -lm
114 </pre>
115 If you are using GMP and <tt>gf2x</tt>, then
116 <pre>
117 % g++ -I$HOME/sw/include foo.c -o foo -L$HOME/sw/lib -lntl -lgmp -lgf2x -lm
118 </pre>
119
105 % g++ -g -O2 foo.c -o foo -lntl -lgmp -lm
106 </pre>
107 If you have installed NTL and/or GMP in a non-standard location,
108 say <tt>/path/to/sw</tt>,
109 then:
110 <pre>
111 % g++ -g -O2 -I/path/to/sw/include foo.c -o foo -L/path/to/sw/lib -lntl -lgmp -lm
112 </pre>
113 If you build NTL with <tt>gf2x</tt>, just add the option
114 <tt>-lgf2x</tt> to the above, right after <tt>-lgmp</tt>.
115
116 <p>
117 If you are working in the NTL <tt>src</tt> directory itself,
118 you can just run:
119 <pre>
120 % make foo
121 </pre>
122 to compile a program <tt>foo.c</tt>, as above.
120123
121124 <p>
122125 <h2>
134137 <pre>
135138 % gunzip ntl-xxx.tar.gz
136139 % tar xvf ntl-xxx.tar
140 </pre>
141
142 On most systems, the following shortcut works:
143 <pre>
144 % tar xzvf ntl-xxx.tar.gz
137145 </pre>
138146
139147 <p>
180188 <pre>
181189
182190 CXX=g++ # The C++ compiler
191
183192 CXXFLAGS=-g -O2 # C++ compilation flags
184193
185 DEF_PREFIX=/usr/local # Default software directory
194 NATIVE=on # Compiles code targeted to the current hardware
195
196 DEF_PREFIX=/usr/local# Default software directory
197
186198 PREFIX=$(DEF_PREFIX) # Directory in which to install NTL library components
187199 SHARED=off # Generate a shared library (as well as static)
188200
189201 NTL_THREADS=off # compile in thread-safe mode
202 NTL_THREAD_BOOST=off # compile with thread boosting enabled
190203 NTL_EXCEPTIONS=off # compile with exceptions enabled
191204
192 NTL_GMP_LIP=off # Switch to enable the use of GMP as primary
205 NTL_GMP_LIP=on # Switch to enable the use of GMP as primary
193206 # long integer package
194207
195208 GMP_PREFIX=$(DEF_PREFIX) # Directory in which GMP components are installed
196
197 NTL_PCLMUL=off # switch to enable PCLMUL instruction for
198 # faster arithmetic in GF(2)[X]
199209
200210 NTL_GF2X_LIB=off # Switch to enable the use of the gf2x package
201211 # for faster arithmetic GF(2)[X]
228238
229239 <p>
230240 <li>
231 If you want to use, say, the options <tt>-g</tt> and <tt>-O</tt> for
232 compiling <tt>C</tt> and <tt>C++</tt>, run:
233 <pre>
234 % ./configure "CXXFLAGS=-g -O"
235 </pre>
236 Note the use of quotes to keep the argument in one piece.
237 Also note that the configuration script will sometimes
238 automatically adjust <tt>CXXFLAGS</tt>, depending on
239 other configuration flags that are set
240 (specifically, the <tt>NTL_PCLMUL</tt>, <tt>NTL_THREADS</tt>,
241 and <tt>NTL_EXCEPTIONS</tt> flags).
242 However, these automatic adjustments <i>will not</i>
243 be done if you explicitly set <tt>CXXFLAGS</tt> yourself:
244 you are on your own, then.
245
246
247 <p>
248 <li>
249 If <a href="tour-gmp.html">GMP (the GNU Multi-Precision package)</a>
250 is installed in a standard system directory, and you want to use it
251 to obtain better performance for long integer arithemtic, run:
252 <pre>
253 % ./configure NTL_GMP_LIP=on
254 </pre>
241 If you want to use, say, the options <tt>-g</tt> and <tt>-O3</tt> for
242 compiling <tt>C++</tt>, run:
243 <pre>
244 % ./configure "CXXFLAGS=-g -O3"
245 </pre>
246
247
248 <p>
255249 If GMP was installed in
256250 <tt>$HOME/sw</tt>,
257251 run:
258252 <pre>
259 % ./configure NTL_GMP_LIP=on GMP_PREFIX=$HOME/sw
253 % ./configure GMP_PREFIX=$HOME/sw
260254 </pre>
261255 Go <a href="tour-gmp.html">here</a> for complete details.
262256
293287 description.
294288
295289 <p>
296 Note that all of these configuration options can also be set
290 Note that many of these configuration options can also be set
297291 by editing the two files <tt>makefile</tt>
298292 and <tt>../include/NTL/config.h</tt> by hand.
299293 These files are fairly simple and well documented, and so this is not
305299 and that the file "<tt>def_makefile</tt>"
306300 contains a backup copy of the original <tt>makefile</tt> file.
307301
308 <p>
309 This command is intended only as a convenience
310 and -- more importantly -- to allow the configuration process
311 to be script driven.
312 This script does not perform any "magic", like finding out what
313 the local C compiler is called, etc.
314 If the defaults are not
315 correct for your platform, you have to set an appropriate variable.
316302
317303
318304
344330
345331 <p>
346332 <li>
347 A script is run that "automagically"
348 determines the best way to write a timing function
349 on your platform.
350 It tries different routines in the files <tt>GetTime1.c</tt>,
351 <tt>GetTime2.c</tt>, etc., and when it finds a good one,
352 it copies the file into <tt>GetTime.c</tt>.
353 A similar script is run to "automagically"
354 determine if there is something like a <tt>getpid</tt>
355 function available on your platform.
356
333 Several scripts are run to obtain more information
334 about your system (e.g.,
335 to find a timing function, a "getpid" function,
336 and to detect if things like Intel AVX intrinsics work).
357337
358338
359339 <p>
360340 <li>
361341 The file "<tt>../include/NTL/gmp_aux.h</tt>"
362342 is generated for use with GMP.
363 If not using GMP, this files are still created, but it is empty.
343 If not using GMP, this file is still created, but it is empty.
364344
365345
366346 <p>
381361 <pre>
382362 NTL_LONG_LONG NTL_AVOID_FLOAT NTL_TBL_REM NTL_TBL_REM_LL NTL_AVOID_BRANCHING
383363 NTL_SPMM_ULL NTL_SPMM_ASM NTL_GF2X_NOINLINE NTL_GF2X_ALTCODE
384 NTL_GF2X_ALTCODE1 NTL_FFT_LAZYMUL NTL_FFT_BIGTAB
364 NTL_GF2X_ALTCODE1 NTL_FFT_LAZYMUL NTL_FFT_BIGTAB NTL_PCLMUL
385365 </pre>
386366 which are set by the wizard.
387367 Also note that if you <i>do not</i> want the wizard to run,
422402 copies a number of files to a directory <tt>&lt;prefix&gt;</tt> that you
423403 specify by passing <tt>PREFIX=&lt;prefix&gt;</tt>
424404 as an argument to <tt>configure</tt> at configuration time,
425 or as an argument to <tt>make install</tt> at installation time.
405 or as an argument to <tt>make install</tt> at installation time
406 (e.g., <tt>make install PREFIX=/path/to/sw</tt>).
426407 The default is <tt>/usr/local</tt>, so either you need root
427408 permissions, or you choose a <tt>&lt;prefix&gt;</tt> for which
428409 you have write permission.
450431 <tt>ntl.a</tt>.
451432 To rebuild after executing <tt>make clean</tt>, execute <tt>make ntl.a</tt>.
452433
453
454 <p>
455 Assuming you have installed NTL as above,
456 to compile a program <tt>foo.c</tt> that uses NTL,
457 execute
458 <pre>
459 g++ -I&lt;prefix&gt;/include foo.c -o foo -L&lt;prefix&gt;/lib -lntl -lm
460 </pre>
461 This compiles <tt>foo.c</tt> as a <tt>C++</tt> program
462 and creates the binary <tt>foo</tt>.
463 <p>
464 If you built NTL using <a href="tour-gmp.html">GMP</a>, execute:
465 <pre>
466 g++ -I&lt;prefix&gt;/include foo.c -o foo -L&lt;prefix&gt;/lib -lntl -L&lt;gmp_prefix&gt;/lib -lgmp -lm
467 </pre>
468 <p>
469 Of course, if <tt>&lt;prefix&gt;</tt> and <tt>&lt;gmp_prefix&gt;</tt>
470 are the same, you do not need to duplicate the <tt>-L</tt>
471 flags, and if either are standard directories, like <tt>/usr/local</tt>,
472 you can leave out the corresponding <tt>-I</tt> and <tt>-L</tt>
473 flags altogether.
474 <p>
475 Similarly, if you built NTL using <a href="tour-gf2x.html"><tt>gf2x</tt></a>,
476 you should include flags
477 <pre>
478 -L&lt;gf2x_prefix&gt;/lib -lgf2x
479 </pre>
480 on the command line.
481 <p>
482 This works even if you are not working in the directory
483 in which you built NTL.
484 If you <i>are</i> working in that directory, you can just execute
485 <pre>
486 make foo
487 </pre>
488434
489435 <p>
490436 <h2>
3939 <p>
4040 <b>Windows Users:</b>
4141 you should consider using a Unix emulation environment like
42 <a href="http://www.mingw.org/">MinGW</a>
43 or <a href="https://www.cygwin.com/">Cygwin</a>, instead of
42 <a href="https://www.cygwin.com/">Cygwin</a>
43 or
44 <a href="http://www.mingw.org/">MinGW</a>,
45 instead of
4446 Microsoft development tools.
47 <p>
4548 Why?
4649 <ul>
4750 <li>
48 MinGW uses gcc, which generally adheres closer to language
51 These environments use gcc, which generally adheres closer to language
4952 standards and produces more efficient code than Microsoft's
5053 compiler.
5154 <p><li>
52 With MinGW, you can use NTL's
55 With these environments, you can use NTL's
5356 <a href="tour-unix.html">Unix distribution</a>,
5457 and the installation is almost entirely automatic:
5558 no pointing and clicking -- not much more
5861 and run NTL's performance-tuning Wizard.
5962 These factors combined can make a huge difference in performance,
6063 easily giving you a huge (10x or more) performance improvement.
64 <p><li>
65 On 64-bit machines, you should definitely consider Cygwin:
66 the 64-bit version of Cygwin gives you an
67 <a href="https://en.wikipedia.org/wiki/64-bit_computing">LP64 data model</a>,
68 which for many reasons is preferable to the Windows data model.
69 In particular, you will get the most performance out of NTL
70 in this environment.
6171 </ul>
6272
6373
6474
75
76 <p>
77 The remaining instructions on this page only apply
78 if you <i>do not</i> use a Unix emulation environment
79 like Cygwin or MinGW.
80
81 <p>
82 If you really want to get the most out of NTL, please stop,
83 and seriously consider using a Unix emulation environment
84 and
85 NTL's
86 <a href="tour-unix.html">Unix distribution</a>.
87 Your code will be much snappier, and your quality of life
88 will be much better.
89
90 <p>
91 You have been warned.
92
6593 <p>
6694 <b>
6795 Obtaining and unpacking NTL.
6896 </b>
97
6998 <p>
7099
71100 To obtain the source code and documentation for NTL,
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_GF2.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_GF2.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_GF2E.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_GF2E.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_RR.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_RR.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_ZZ.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_ZZ.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_ZZ_p.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_ZZ_p.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_ZZ_pE.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_ZZ_pE.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_lzz_p.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_lzz_p.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_lzz_pE.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_lzz_pE.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vector.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vector.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
109109 <font color="#0000ed"><i>that NTL's strategy.&nbsp;&nbsp;However, the new &quot;move semantics&quot;, introduced in C++11,</i></font><br>
110110 <font color="#0000ed"><i>mitigate this issue somewhat.</i></font><br>
111111 <br>
112 <font color="#0000ed"><i>Because of NTL's relocatability requirement, it is not recommended to</i></font><br>
113 <font color="#0000ed"><i>use NTL vectors over classes coming from the standard library, which</i></font><br>
114 <font color="#0000ed"><i>may not satisfy the requirement.</i></font><br>
112 <font color="#0000ed"><i>Because of NTL's relocatability requirement, it is not recommended to use NTL</i></font><br>
113 <font color="#0000ed"><i>vectors over classes coming from the standard library, which may not satisfy</i></font><br>
114 <font color="#0000ed"><i>the requirement.&nbsp;&nbsp;In those cases, you could either use an STL vector, or use an</i></font><br>
115 <font color="#0000ed"><i>NTL vector and wrap the suspect classes in an NTL smart pointer of some kind</i></font><br>
116 <font color="#0000ed"><i>(SmartPtr or OptionalVal).</i></font><br>
115117 <br>
116118 <font color="#0000ed"><i>Note also that Facebook's open source &quot;folly&quot; library also provides</i></font><br>
117119 <font color="#0000ed"><i>a vector class that uses realloc in a manner very similar to NTL's vector class.</i></font><br>
198200 &nbsp;&nbsp;<br>
199201 &nbsp;&nbsp; T* elts();<br>
200202 &nbsp;&nbsp; <font color="#008b00"><b>const</b></font>&nbsp;T* elts() <font color="#008b00"><b>const</b></font>;<br>
201 &nbsp;&nbsp; <font color="#0000ed"><i>// returns address of first vector element (or 0 if no space has</i></font><br>
202 &nbsp;&nbsp; <font color="#0000ed"><i>// been allocated for this vector).&nbsp;&nbsp;If a vector potentially has</i></font><br>
203 &nbsp;&nbsp; <font color="#0000ed"><i>// length 0, it is safer to write v.elts() instead of &amp;v[0].</i></font><br>
204 &nbsp;&nbsp; <font color="#0000ed"><i>// The first version is applied to non-const Vec&lt;T&gt;,</i></font><br>
205 &nbsp;&nbsp; <font color="#0000ed"><i>// and returns a non-const pointer to a T, while the second version</i></font><br>
206 &nbsp;&nbsp; <font color="#0000ed"><i>// is applied to a const Vec&lt;T&gt; and returns a const reference to a T.</i></font><br>
203 &nbsp;&nbsp; <font color="#0000ed"><i>// returns address of first vector element (or 0 if no space has been</i></font><br>
204 &nbsp;&nbsp; <font color="#0000ed"><i>// allocated for this vector).&nbsp;&nbsp;If a vector potentially has length 0, it is</i></font><br>
205 &nbsp;&nbsp; <font color="#0000ed"><i>// safer to write v.elts() instead of &amp;v[0]: the latter is not well defined</i></font><br>
206 &nbsp;&nbsp; <font color="#0000ed"><i>// by the C++ standard (although this is likely an academic concern).</i></font><br>
207 &nbsp;&nbsp; <font color="#0000ed"><i>//</i></font><br>
208 &nbsp;&nbsp; <font color="#0000ed"><i>// The first version is applied to non-const Vec&lt;T&gt;, and returns a non-const</i></font><br>
209 &nbsp;&nbsp; <font color="#0000ed"><i>// pointer to a T, while the second version is applied to a const Vec&lt;T&gt; and</i></font><br>
210 &nbsp;&nbsp; <font color="#0000ed"><i>// returns a pointer to a const T.</i></font><br>
207211 <br>
208212 &nbsp;&nbsp; <br>
209213 &nbsp;&nbsp; <font color="#008b00"><b>void</b></font>&nbsp;swap(Vec&lt;T&gt;&amp; y);<br>
101101 that NTL's strategy. However, the new "move semantics", introduced in C++11,
102102 mitigate this issue somewhat.
103103
104 Because of NTL's relocatability requirement, it is not recommended to
105 use NTL vectors over classes coming from the standard library, which
106 may not satisfy the requirement.
104 Because of NTL's relocatability requirement, it is not recommended to use NTL
105 vectors over classes coming from the standard library, which may not satisfy
106 the requirement. In those cases, you could either use an STL vector, or use an
107 NTL vector and wrap the suspect classes in an NTL smart pointer of some kind
108 (SmartPtr or OptionalVal).
107109
108110 Note also that Facebook's open source "folly" library also provides
109111 a vector class that uses realloc in a manner very similar to NTL's vector class.
190192
191193 T* elts();
192194 const T* elts() const;
193 // returns address of first vector element (or 0 if no space has
194 // been allocated for this vector). If a vector potentially has
195 // length 0, it is safer to write v.elts() instead of &v[0].
196 // The first version is applied to non-const Vec<T>,
197 // and returns a non-const pointer to a T, while the second version
198 // is applied to a const Vec<T> and returns a const reference to a T.
195 // returns address of first vector element (or 0 if no space has been
196 // allocated for this vector). If a vector potentially has length 0, it is
197 // safer to write v.elts() instead of &v[0]: the latter is not well defined
198 // by the C++ standard (although this is likely an academic concern).
199 //
200 // The first version is applied to non-const Vec<T>, and returns a non-const
201 // pointer to a T, while the second version is applied to a const Vec<T> and
202 // returns a pointer to a const T.
199203
200204
201205 void swap(Vec<T>& y);
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/version.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/version.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
00 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
11 <html>
22 <head>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/xdouble.cpp.html</title>
3 <title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/xdouble.cpp.html</title>
44 <meta name="Generator" content="Vim/7.1">
55 <meta http-equiv="content-type" content="text/html; charset=UTF-8">
66 </head>
0
1 #ifndef NTL_BasicThreadPool__H
2 #define NTL_BasicThreadPool__H
3
4 #include <NTL/tools.h>
5 #include <NTL/vector.h>
6 #include <NTL/SmartPtr.h>
7 #include <NTL/thread.h>
8
9
10 NTL_OPEN_NNS
11
12
13 inline long AvailableThreads();
14
15 struct PartitionInfo {
16 long nintervals; // number of intervals
17 long intervalsz; // interval size
18 long nsintervals; // number of small intervals
19
20 explicit
21 PartitionInfo(long sz, long nt = AvailableThreads())
22 // partitions [0..sz) into nintervals intervals,
23 // so that there are nsintervals of size intervalsz-1
24 // and nintervals-nsintervals of size intervalsz
25 {
26 if (sz <= 0) {
27 nintervals = intervalsz = nsintervals = 0;
28 return;
29 }
30
31 if (nt <= 0) LogicError("PartitionInfo: bad args");
32
33 // NOTE: this overflow check probably unnecessary
34 if (NTL_OVERFLOW(sz, 1, 0) || NTL_OVERFLOW(nt, 1, 0))
35 ResourceError("PartitionInfo: arg too big");
36
37 if (sz < nt) {
38 nintervals = sz;
39 intervalsz = 1;
40 nsintervals = 0;
41 return;
42 }
43
44 nintervals = nt;
45
46 long q, r;
47 q = sz/nt;
48 r = sz - nt*q;
49
50 if (r == 0) {
51 intervalsz = q;
52 nsintervals = 0;
53 }
54 else {
55 intervalsz = q+1;
56 nsintervals = nt - r;
57 }
58 }
59
60 long NumIntervals() const { return nintervals; }
61
62 void interval(long& first, long& last, long i) const
63 // [first..last) is the ith interval -- no range checking is done
64 {
65
66 #if 0
67 // this is the logic, naturally expressed
68 if (i < nsintervals) {
69 first = i*(intervalsz-1);
70 last = first + (intervalsz-1);
71 }
72 else {
73 first = nsintervals*(intervalsz-1) + (i-nsintervals)*intervalsz;
74 last = first + intervalsz;
75 }
76 #else
77 // this is the same logic, but branch-free (and portable)
78 // ...probably unnecessary optimization
79
80 long mask = -long(cast_unsigned(i-nsintervals) >> (NTL_BITS_PER_LONG-1));
81 // mask == -1 if i < nsintervals, 0 o/w
82
83 long lfirst = i*(intervalsz-1);
84 lfirst += long((~cast_unsigned(mask)) & cast_unsigned(i-nsintervals));
85 // lfirst += max(0, i-nsintervals)
86
87 long llast = lfirst + intervalsz + mask;
88
89 first = lfirst;
90 last = llast;
91 #endif
92 }
93
94 };
95
96
97
98 NTL_CLOSE_NNS
99
100
101
102 #ifdef NTL_THREADS
103
104
105 #include <thread>
106 #include <condition_variable>
107 #include <exception>
108
109
110 NTL_OPEN_NNS
111
112 /*************************************************************
113
114 Some simple thread pooling.
115
116 You create a thread pool by constructing a BasicThreadPool object.
117 For example:
118
119 long nthreads = 4;
120 BasicThreadPool pool(nthreads);
121
122 creates a thread pool of 4 threads. These threads will exist
123 until the destructor for pool is called.
124
125 The simplest way to use a thread pool is as follows.
126 Suppose you have a task that consists of N subtasks,
127 indexed 0..N-1. Then you can write:
128
129
130 pool.exec_range(N,
131 [&](long first, long last) {
132 for (long i = first; i < last; i++) {
133 ... code to process subtask i ...
134 }
135 }
136 );
137
138 The second argument to exec_range is a C++11 "lambda".
139 The "[&]" indicates that all local variables in the calling
140 context are captured by reference, so the lambda body can
141 reference all visible local variables directly.
142
143 A lower-level interface is also provided.
144 One can write:
145
146 pool.exec_index(n,
147 [&](long index) {
148 ... code to process index i ...
149 }
150 );
151
152 This will activate n threads with indices 0..n-1, and execute
153 the given code on each index. The parameter n must be
154 in the range 1..nthreads, otherwise an error is raised.
155
156 This lower-level interface is useful in some cases,
157 especially when memory is managed in some special way.
158 For convenience, a method is provided to break
159 subtasks up into smaller, almost-equal-sized groups
160 of subtasks:
161
162 Vec<long> pvec;
163 long n = pool.SplitProblems(N, pvec);
164
165 can be used for this. N is the number of subtasks, indexed 0..N-1.
166 This method will compute n as needed by exec_index, and
167 the range of subtasks to be processed by a given index in the range
168 0..n-1 is pvec[index]..pvec[index+1]-1.
169 Thus, the logic of the above exec_range example can be written
170 using the lower-level exec_index interface as follows:
171
172
173 Vec<long> pvec;
174 long n = pool.SplitProblems(N, pvec);
175 pool.exec_index(n,
176 [&](long index) {
177 long first = pvec[index];
178 long last = pvec[index+1];
179 for (long i = first; i < last; i++) {
180 ... code to process subtask i ...
181 }
182 }
183 );
184
185 However, with this approach, memory or other resources can be
186 assigned to each index = 0..n-1, and managed externally.
187
188
189
190
191 *************************************************************/
192
193
194 class BasicThreadPool {
195 private:
196
197 // lots of nested stuff
198
199 template<class T>
200 class SimpleSignal {
201 private:
202 T val;
203 std::mutex m;
204 std::condition_variable cv;
205
206 SimpleSignal(const SimpleSignal&); // disabled
207 void operator=(const SimpleSignal&); // disabled
208
209 public:
210 SimpleSignal() : val(0) { }
211
212 T wait()
213 {
214 std::unique_lock<std::mutex> lock(m);
215 cv.wait(lock, [&]() { return val; } );
216 T old_val = val;
217 val = 0;
218 return old_val;
219 }
220
221 void send(T new_val)
222 {
223 std::lock_guard<std::mutex> lock(m);
224 val = new_val;
225 cv.notify_one();
226 }
227 };
228
229
230 template<class T, class T1>
231 class CompositeSignal {
232 private:
233 T val;
234 T1 val1;
235 std::mutex m;
236 std::condition_variable cv;
237
238 CompositeSignal(const CompositeSignal&); // disabled
239 void operator=(const CompositeSignal&); // disabled
240
241 public:
242 CompositeSignal() : val(0) { }
243
244 T wait(T1& _val1)
245 {
246 std::unique_lock<std::mutex> lock(m);
247 cv.wait(lock, [&]() { return val; } );
248 T _val = val;
249 _val1 = val1;
250 val = 0;
251 return _val;
252 }
253
254 void send(T _val, T1 _val1)
255 {
256 std::lock_guard<std::mutex> lock(m);
257 val = _val;
258 val1 = _val1;
259 cv.notify_one();
260 }
261 };
262
263
264
265 class ConcurrentTask {
266 BasicThreadPool *pool;
267 public:
268 ConcurrentTask(BasicThreadPool *_pool) : pool(_pool) { }
269 BasicThreadPool *getBasicThreadPool() const { return pool; }
270
271 virtual void run(long index) = 0;
272 };
273
274
275
276 // dummy class, used for signalling termination
277 class ConcurrentTaskTerminate : public ConcurrentTask {
278 public:
279 ConcurrentTaskTerminate() : ConcurrentTask(0) { }
280 void run(long index) { }
281 };
282
283
284
285 template<class Fct>
286 class ConcurrentTaskFct : public ConcurrentTask {
287 public:
288 const Fct& fct;
289
290 ConcurrentTaskFct(BasicThreadPool *_pool, const Fct& _fct) :
291 ConcurrentTask(_pool), fct(_fct) { }
292
293 void run(long index) { fct(index); }
294 };
295
296 template<class Fct>
297 class ConcurrentTaskFct1 : public ConcurrentTask {
298 public:
299 const Fct& fct;
300 const PartitionInfo& pinfo;
301
302 ConcurrentTaskFct1(BasicThreadPool *_pool, const Fct& _fct,
303 const PartitionInfo& _pinfo) :
304 ConcurrentTask(_pool), fct(_fct), pinfo(_pinfo) { }
305
306 void run(long index)
307 {
308 long first, last;
309 pinfo.interval(first, last, index);
310 fct(first, last);
311 }
312 };
313
314
315
316 struct AutomaticThread {
317 CompositeSignal< ConcurrentTask *, long > localSignal;
318 ConcurrentTaskTerminate term;
319 std::thread t;
320
321
322 AutomaticThread() : t(worker, &localSignal)
323 {
324 // cerr << "starting thread " << t.get_id() << "\n";
325 }
326
327 ~AutomaticThread()
328 {
329 // cerr << "stopping thread " << t.get_id() << "...";
330 localSignal.send(&term, -1);
331 t.join();
332 // cerr << "\n";
333 }
334 };
335
336
337
338 // BasicThreadPool data members
339
340 long nthreads;
341
342 bool active_flag;
343
344 std::atomic<long> counter;
345 SimpleSignal<bool> globalSignal;
346
347 Vec< UniquePtr<AutomaticThread> > threadVec;
348
349 std::exception_ptr eptr;
350 std::mutex eptr_guard;
351
352 // BasicThreadPool private member functions
353
354 BasicThreadPool(const BasicThreadPool&); // disabled
355 void operator=(const BasicThreadPool&); // disabled
356
357 void launch(ConcurrentTask *task, long index)
358 {
359 threadVec[index]->localSignal.send(task, index);
360 }
361
362 void begin(long cnt)
363 {
364
365 active_flag = true;
366 counter = cnt;
367 }
368
369 void end()
370 {
371 globalSignal.wait();
372
373 active_flag = false;
374
375 if (eptr) {
376 std::exception_ptr eptr1 = eptr;
377 eptr = nullptr;
378 std::rethrow_exception(eptr1);
379 }
380 }
381
382 static void runOneTask(ConcurrentTask *task, long index)
383 {
384 BasicThreadPool *pool = task->getBasicThreadPool();
385
386 try {
387 task->run(index);
388 }
389 catch (...) {
390 std::lock_guard<std::mutex> lock(pool->eptr_guard);
391 if (!pool->eptr) pool->eptr = std::current_exception();
392 }
393
394 if (--(pool->counter) == 0) pool->globalSignal.send(true);
395 }
396
397 static void worker(CompositeSignal< ConcurrentTask *, long > *localSignal)
398 {
399 for (;;) {
400 long index = -1;
401 ConcurrentTask *task = localSignal->wait(index);
402 if (index == -1) return;
403
404 runOneTask(task, index);
405 }
406 }
407
408
409 public:
410
411
412 long NumThreads() const { return nthreads; }
413 bool active() const { return active_flag; }
414
415 explicit
416 BasicThreadPool(long _nthreads) :
417 nthreads(_nthreads), active_flag(false), counter(0)
418 {
419 if (nthreads <= 0) LogicError("BasicThreadPool::BasicThreadPool: bad args");
420
421 if (NTL_OVERFLOW(nthreads, 1, 0))
422 ResourceError("BasicThreadPool::BasicThreadPool: arg too big");
423
424 threadVec.SetLength(nthreads-1);
425
426 for (long i = 0; i < nthreads-1; i++) {
427 threadVec[i].make();
428 }
429 }
430
431 ~BasicThreadPool()
432 {
433 if (active()) TerminalError("BasicThreadPool: destructor called while active");
434 }
435
436
437 // adding, deleting, moving threads
438
439 void add(long n = 1)
440 {
441 if (active()) LogicError("BasicThreadPool: illegal operation while active");
442 if (n <= 0) LogicError("BasicThreadPool::add: bad args");
443 if (NTL_OVERFLOW(n, 1, 0))
444 ResourceError("BasicThreadPool::add: arg too big");
445
446 Vec< UniquePtr<AutomaticThread> > newThreads;
447
448 newThreads.SetLength(n);
449 for (long i = 0; i < n; i++)
450 newThreads[i].make();
451
452 threadVec.SetLength(n + nthreads - 1);
453 for (long i = 0; i < n; i++)
454 threadVec[nthreads-1+i].move(newThreads[i]);
455
456 nthreads += n;
457 }
458
459
460 void remove(long n = 1)
461 {
462 if (active()) LogicError("BasicThreadPool: illegal operation while active");
463 if (n <= 0 || n >= nthreads) LogicError("BasicThreadPool::remove: bad args");
464
465 for (long i = nthreads-1-n; i < nthreads-1; i++)
466 threadVec[i] = 0;
467
468 threadVec.SetLength(nthreads-1-n);
469 nthreads -= n;
470 }
471
472
473 void move(BasicThreadPool& other, long n = 1)
474 {
475 if (active() || other.active())
476 LogicError("BasicThreadPool: illegal operation while active");
477 if (n <= 0 || n >= other.nthreads) LogicError("BasicThreadPool::move: bad args");
478
479 if (this == &other) return;
480
481 threadVec.SetLength(n + nthreads - 1);
482 for (long i = 0; i < n; i++)
483 threadVec[nthreads-1+i].move(other.threadVec[other.nthreads-1-n+i]);
484
485 other.threadVec.SetLength(other.nthreads-1-n);
486 other.nthreads -= n;
487
488 nthreads += n;
489 }
490
491
492
493 // High level interfaces, intended to be used with lambdas
494
495 // In this version, fct takes one argument, which is
496 // an index in [0..cnt)
497
498 template<class Fct>
499 void exec_index(long cnt, const Fct& fct)
500 {
501 if (active()) LogicError("BasicThreadPool: illegal operation while active");
502 if (cnt <= 0) return;
503 if (cnt > nthreads) LogicError("BasicThreadPool::exec_index: bad args");
504
505 ConcurrentTaskFct<Fct> task(this, fct);
506
507 begin(cnt);
508 for (long t = 0; t < cnt-1; t++) launch(&task, t);
509 runOneTask(&task, cnt-1);
510 end();
511 }
512
513 template<class Fct>
514 static void relaxed_exec_index(BasicThreadPool *pool, long cnt, const Fct& fct)
515 {
516 if (cnt <= 0) return;
517 if (!pool || pool->active()) {
518 if (cnt > 0) LogicError("friendly_exec_index: not enough threads");
519 fct(0);
520 }
521 else {
522 pool->exec_index(cnt, fct);
523 }
524 }
525
526 // even higher level version: sz is the number of subproblems,
527 // and fct takes two args, first and last, so that subproblems
528 // [first..last) are processed.
529
530 template<class Fct>
531 void exec_range(long sz, const Fct& fct)
532 {
533 if (active()) LogicError("BasicThreadPool: illegal operation while active");
534 if (sz <= 0) return;
535
536 PartitionInfo pinfo(sz, nthreads);
537
538 long cnt = pinfo.NumIntervals();
539 ConcurrentTaskFct1<Fct> task(this, fct, pinfo);
540
541 begin(cnt);
542 for (long t = 0; t < cnt-1; t++) launch(&task, t);
543 runOneTask(&task, cnt-1);
544 end();
545 }
546
547 template<class Fct>
548 static void relaxed_exec_range(BasicThreadPool *pool, long sz, const Fct& fct)
549 {
550 if (sz <= 0) return;
551 if (!pool || pool->active() || sz == 1) {
552 fct(0, sz);
553 }
554 else {
555 pool->exec_range(sz, fct);
556 }
557 }
558
559 };
560
561
562
563
564 NTL_CLOSE_NNS
565
566
567 #endif
568
569
570
571 #ifdef NTL_THREAD_BOOST
572
573 #ifndef NTL_THREADS
574 #error "NTL_THREAD_BOOST requires NTL_THREADS"
575 #endif
576
577 NTL_OPEN_NNS
578
579 extern
580 NTL_CHEAP_THREAD_LOCAL BasicThreadPool *NTLThreadPool_ptr;
581
582 inline
583 BasicThreadPool *GetThreadPool()
584 {
585 return NTLThreadPool_ptr;
586 }
587
588 void ResetThreadPool(BasicThreadPool *pool = 0);
589 BasicThreadPool *ReleaseThreadPool();
590
591 inline void SetNumThreads(long n)
592 {
593 ResetThreadPool(MakeRaw<BasicThreadPool>(n));
594 }
595
596 inline long AvailableThreads()
597 {
598 BasicThreadPool *pool = GetThreadPool();
599 if (!pool || pool->active())
600 return 1;
601 else
602 return pool->NumThreads();
603 }
604
605
606 NTL_CLOSE_NNS
607
608
609 #define NTL_EXEC_RANGE(n, first, last) \
610 { \
611 NTL_NNS BasicThreadPool::relaxed_exec_range(NTL_NNS GetThreadPool(), (n), \
612 [&](long first, long last) { \
613
614
615 #define NTL_EXEC_RANGE_END \
616 } ); \
617 } \
618
619
620 #define NTL_GEXEC_RANGE(seq, n, first, last) \
621 { \
622 NTL_NNS BasicThreadPool::relaxed_exec_range((seq) ? 0 : NTL_NNS GetThreadPool(), (n), \
623 [&](long first, long last) { \
624
625
626 #define NTL_GEXEC_RANGE_END \
627 } ); \
628 } \
629
630
631 #define NTL_EXEC_INDEX(n, index) \
632 { \
633 NTL_NNS BasicThreadPool::relaxed_exec_index(NTL_NNS GetThreadPool(), (n), \
634 [&](long index) { \
635
636
637 #define NTL_EXEC_INDEX_END \
638 } ); \
639 } \
640
641
642
643 // NOTE: at least with gcc >= 4.9.2, the GEXEC versions will evaluate seq, and
644 // if it is true, jump directly (more or less) to the body
645
646
647 #define NTL_TBDECL(x) static void basic_ ## x
648 #define NTL_TBDECL_static(x) static void basic_ ## x
649
650
651 #else
652
653 NTL_OPEN_NNS
654
655
656 inline void SetNumThreads(long n) { }
657
658 inline long AvailableThreads() { return 1; }
659
660
661 NTL_CLOSE_NNS
662
663 #define NTL_EXEC_RANGE(n, first, last) \
664 { \
665 long _ntl_par_exec_n = (n); \
666 if (_ntl_par_exec_n > 0) { \
667 long first = 0; \
668 long last = _ntl_par_exec_n; \
669 { \
670
671
672 #define NTL_EXEC_RANGE_END }}}
673
674 #define NTL_GEXEC_RANGE(seq, n, first, last) \
675 { \
676 long _ntl_par_exec_n = (n); \
677 if (_ntl_par_exec_n > 0) { \
678 long first = 0; \
679 long last = _ntl_par_exec_n; \
680 { \
681
682
683 #define NTL_GEXEC_RANGE_END }}}
684
685
686
687
688 #define NTL_EXEC_INDEX(n, index) \
689 { \
690 long _ntl_par_exec_n = (n); \
691 if (_ntl_par_exec_n > 0) { \
692 if (_ntl_par_exec_n > 1) NTL_NNS LogicError("NTL_EXEC_INDEX: not enough threads"); \
693 long index = 0; \
694 { \
695
696
697 #define NTL_EXEC_INDEX_END }}}
698
699
700
701 #define NTL_TBDECL(x) void x
702 #define NTL_TBDECL_static(x) static void x
703
704 #endif
705
706
707
708 #ifdef NTL_THREADS
709
710 #define NTL_IMPORT(x) auto _ntl_hidden_variable_IMPORT__ ## x = x; auto x = _ntl_hidden_variable_IMPORT__ ##x;
711
712 #else
713
714 #define NTL_IMPORT(x)
715
716
717 #endif
718
719
720
721 #endif
722
3131 Lazy<ZZ> _card;
3232 };
3333
34 NTL_THREAD_LOCAL
35 extern SmartPtr<GF2EInfoT> GF2EInfo; // info for current modulus, initially null
34 extern
35 NTL_CHEAP_THREAD_LOCAL
36 GF2EInfoT *GF2EInfo;
37 // info for current modulus, initially null
38 // fast TLS access
3639
3740
3841
848848 vec_GF2EX H;
849849 };
850850
851 NTL_THREAD_LOCAL
852 extern long GF2EXArgBound;
851 extern
852 NTL_CHEAP_THREAD_LOCAL
853 long GF2EXArgBound;
853854
854855
855856 void build(GF2EXArgument& H, const GF2EX& h, const GF2EXModulus& F, long m);
6868 // Uses "Berlekamp" approach.
6969
7070
71 NTL_THREAD_LOCAL
72 extern long GF2EX_BlockingFactor;
71 extern
72 NTL_CHEAP_THREAD_LOCAL
73 long GF2EX_BlockingFactor;
7374 // Controls GCD blocking for DDF.
7475
7576 void DDF(vec_pair_GF2EX_long& factors, const GF2EX& f, const GF2EX& h,
8485 // Assumes f is monic and square-free, and h = X^p mod f
8586 // Obsolete: see NewDDF, below.
8687
87 NTL_THREAD_LOCAL
88 extern long GF2EX_GCDTableSize; /* = 4 */
88 extern
89 NTL_CHEAP_THREAD_LOCAL
90 long GF2EX_GCDTableSize; /* = 4 */
8991 // Controls GCD blocking for NewDDF
9092
9193
92 NTL_THREAD_LOCAL
93 extern double GF2EXFileThresh;
94 extern
95 NTL_CHEAP_THREAD_LOCAL
96 double GF2EXFileThresh;
9497 // external files are used for baby/giant steps if size
9598 // of these tables exceeds GF2EXFileThresh KB.
9699
5555
5656
5757
58 NTL_THREAD_LOCAL static long HexOutput;
58 static NTL_CHEAP_THREAD_LOCAL long HexOutput;
5959
6060 inline GF2X(long i, GF2 c);
6161 inline GF2X(long i, long c);
737737 ~GF2XWatcher() { watched.KillBig(); }
738738 };
739739
740 #define NTL_GF2XRegister(x) NTL_THREAD_LOCAL static GF2X x; GF2XWatcher _WATCHER__ ## x(x)
740 #define NTL_GF2XRegister(x) NTL_TLS_LOCAL(GF2X, x); GF2XWatcher _WATCHER__ ## x(x)
741741
742742
743743
2525
2626 typedef long (*LLLCheckFct)(const vec_ZZ&);
2727
28 NTL_THREAD_LOCAL extern double LLLStatusInterval;
29 NTL_THREAD_LOCAL extern char *LLLDumpFile;
28 extern NTL_CHEAP_THREAD_LOCAL double LLLStatusInterval;
29 extern NTL_CHEAP_THREAD_LOCAL char *LLLDumpFile;
3030
3131
3232 // classical Gram-Schmidt versions
4949 LazyTable();
5050
5151
52 const T * const operator[] (long i) const;
52 const T * operator[] (long i) const;
5353
5454 ~LazyTable();
5555
9797 public:
9898 LazyTable() : len(0) { }
9999
100 const T * const operator[] (long i) const
100 const T * operator[] (long i) const
101101 {
102102 // FIXME: add optional range checking
103103
4343 const ZZ& mantissa() const { return x; }
4444 long exponent() const { return e; }
4545
46 NTL_THREAD_LOCAL static long prec;
46 static NTL_CHEAP_THREAD_LOCAL long prec;
4747 static void SetPrecision(long p);
4848 static long precision() { return prec; }
4949
50 NTL_THREAD_LOCAL static long oprec;
50 static NTL_CHEAP_THREAD_LOCAL long oprec;
5151 static void SetOutputPrecision(long p);
5252 static long OutputPrecision() { return oprec; }
5353
131131 unsigned long hi, lo;
132132 __asm__ ("mulq %3" : "=a" (lo), "=d" (hi) : "%0" (a), "rm" (b));
133133
134 //__asm__ ("mulxq %2,%1,%0" : "=r" (hi), "=r" (lo) : "rm" (a), "d" (b));
135 // this uses the mulx instruction - no real benefit
136
134137 return hi;
135138 }
136139
201201 if (dp) {
202202 cp = NTL_NEW_OP SmartPtrControlDerived<T>(dp);
203203 if (!cp) {
204 delete dp; // if we throw an exception
204 delete dp; // this could theoretically throw an exception
205205 MemoryError();
206206 }
207207 AddRef();
843843 T* release() { T *p = dp; dp = 0; return p; }
844844 void move(UniquePtr& other) { reset(other.release()); }
845845
846 template<class Y>
847 void move(UniquePtr<Y>& other) { reset(other.release()); }
848
846849 void swap(UniquePtr& other)
847850 {
848851 _ntl_swap(dp, other.dp);
918921 // using pseudo variadic templates
919922
920923 p1.reset(rp); // destroys p1's referent and assigns rp
924
921925
922926 if (p1.exists()) ... // test for null
923927
924928 p1.val() // dereference
925929
930 rp = p1.get(); // fetch raw pointer
931 rp = p1.release(); // fetch raw pointer, and set to NULL
926932 p1.move(p2); // if p1 != p2 then:
927933 // makes p1 point to p2's referent,
928934 // setting p2 to NULL and destroying
983989 T& val() const { return *dp; }
984990
985991 bool exists() const { return dp != 0; }
992
993 T* get() const { return dp.get(); }
994
995 T* release() { return dp.release(); }
986996
987997 void move(OptionalVal& other) { dp.move(other.dp); }
988998
10841094 T& operator[](long i) const { return dp[i]; }
10851095
10861096 T* get() const { return dp; }
1097 T *elts() const { return dp; }
10871098
10881099 T* release() { T *p = dp; dp = 0; return p; }
10891100 void move(UniqueArray& other) { reset(other.release()); }
13441355
13451356
13461357
1358 // AlignedArray:
1359 //
1360 // specialized arrays that have similar interface to UniqueArray, but:
1361 // * they are allocated with a given alignment
1362 // * they (currently) only work on POD types
1363 // the current implementation uses posix_memalign, which seems
1364 // to work on gcc and gcc clones (clang and icc).
1365 // intended for use with Intel AVX intrinsics
1366 //
1367 // For now, this is not a part of the documented interface, and it is only
1368 // works with __GNUC__. If __GNUC__ is not defined, then it reverts to using
1369 // malloc. Currently, it is only really needed if NTL_HAVE_AVX is defined,
1370 // which anyway requires __GNUC__.
1371 //
1372 // This could all change in the future, if and when there is a more portable
1373 // way of doing this.
1374
1375 // NOTE: the methods reset, free, and release are available, but should really
1376 // only be used to move raw pointers around between compatible AlignedArray's.
1377
1378 // NOTE: posix_memalign has been available since glibc 2.1.91, which is some
1379 // time around the year 2000, so this should be portable.
1380
1381 template<class T, long align=NTL_DEFAULT_ALIGN>
1382 class AlignedArray {
1383 private:
1384 T *dp;
1385
1386 class Dummy { };
1387
1388 typedef void (AlignedArray::*fake_null_type)(Dummy) const;
1389 void fake_null_function(Dummy) const {}
1390
1391 bool cannot_compare_these_types() const { return false; }
1392
1393 AlignedArray(const AlignedArray&); // disabled
1394 void operator=(const AlignedArray&); // disabled
1395
1396 public:
1397 explicit AlignedArray(T *p) : dp(p) { }
1398
1399 AlignedArray() : dp(0) { }
1400
1401 ~AlignedArray() { NTL_SNS free(dp); }
1402
1403
1404 void reset(T* p = 0)
1405 {
1406 AlignedArray tmp(p);
1407 tmp.swap(*this);
1408 }
1409
1410 AlignedArray& operator=(fake_null_type) { reset(); return *this; }
1411
1412 void SetLength(long n)
1413 {
1414 using namespace std;
1415 // not clear if posix_memalign is in std:: or ::
1416 // this will make sure to find it in either case
1417
1418 if (align <= 0 || n < 0) LogicError("AlignedArray::SetLength: bad args");
1419 if (NTL_OVERFLOW1(n, sizeof(T), 0)) ResourceError("AlignedArray::SetLength: overflow");
1420
1421 if (n == 0) {
1422 reset();
1423 }
1424 else
1425 {
1426 void *p;
1427
1428 #ifdef __GNUC__
1429 #define NTL_HAVE_ALIGNED_ARRAY
1430 if (posix_memalign(&p, align, n*sizeof(T))) MemoryError();
1431 #else
1432 p = malloc(n*sizeof(T));
1433 if (!p) MemoryError();
1434 #endif
1435
1436 reset( (T*) p );
1437 }
1438 }
1439
1440 T& operator[](long i) const { return dp[i]; }
1441
1442 T* get() const { return dp; }
1443 T* elts() const { return dp; }
1444
1445 T* release() { T *p = dp; dp = 0; return p; }
1446 void move(AlignedArray& other) { reset(other.release()); }
1447
1448 void swap(AlignedArray& other)
1449 {
1450 _ntl_swap(dp, other.dp);
1451 }
1452
1453 AlignedArray(fake_null_type) : dp(0) { }
1454
1455 operator fake_null_type() const
1456 {
1457 return dp ? &AlignedArray::fake_null_function : 0;
1458 }
1459
1460 };
1461
1462
1463 // free swap function
1464 template<class T, long align>
1465 void swap(AlignedArray<T,align>& p, AlignedArray<T,align>& q) { p.swap(q); }
1466
1467
1468
1469
1470
13471471
13481472
13491473
151151 ~ZZWatcher() { watched.KillBig(); }
152152 };
153153
154 #define NTL_ZZRegister(x) NTL_THREAD_LOCAL static ZZ x; ZZWatcher _WATCHER__ ## x(x)
154 #define NTL_ZZRegister(x) NTL_TLS_LOCAL(ZZ, x); ZZWatcher _WATCHER__ ## x(x)
155155
156156
157157
568568 }
569569
570570
571 // montgomery
572 class ZZ_ReduceStructAdapter {
573 public:
574 UniquePtr<_ntl_reduce_struct> rep;
575
576 void init(const ZZ& p, const ZZ& excess)
577 {
578 rep.reset(_ntl_reduce_struct_build(p.rep, excess.rep));
579 }
580
581 void eval(ZZ& x, ZZ& a) const
582 {
583 rep->eval(&x.rep, &a.rep);
584 }
585
586 void adjust(ZZ& x) const
587 {
588 rep->adjust(&x.rep);
589 }
590 };
591
571592
572593
573594 /*******************************************************
652673
653674 inline ZZ& operator%=(ZZ& x, const ZZ& b)
654675 { rem(x, x, b); return x; }
676
677
678 // preconditioned single-precision variant
679 // not documented for now...
680
681
682 class PreconditionedRemainder {
683 private:
684 long p;
685 UniquePtr<_ntl_general_rem_one_struct> pinfo;
686
687 public:
688 PreconditionedRemainder(long _p, long sz) : p(_p)
689 {
690 pinfo.reset(_ntl_general_rem_one_struct_build(p, sz));
691 }
692
693
694 long operator()(const ZZ& a)
695 {
696 return _ntl_general_rem_one_struct_apply(a.rep, p, pinfo.get());
697 }
698 };
699
655700
656701
657702 /**********************************************************
929974 ************************************************************/
930975
931976
977 // ================ NEW PRG STUFF =================
978
979
980 // Low-level key-derivation
981
982
983 void DeriveKey(unsigned char *key, long klen,
984 const unsigned char *data, long dlen);
985
986
987
988 // Low-level chacha stuff
989
990 #define NTL_PRG_KEYLEN (32)
991
992 class RandomStream {
993 private:
994 _ntl_uint32 state[16];
995 unsigned char buf[64];
996 long pos;
997
998 void do_get(unsigned char *res, long n);
999
1000 public:
1001 explicit
1002 RandomStream(const unsigned char *key);
1003
1004 // No default constructor
1005 // default copy and assignment
1006
1007 void get(unsigned char *res, long n)
1008 {
1009 // optimize short reads
1010 if (n >= 0 && n <= 64-pos) {
1011 long i;
1012 for (i = 0; i < n; i++) {
1013 res[i] = buf[pos+i];
1014 }
1015 pos += n;
1016 }
1017 else {
1018 do_get(res, n);
1019 }
1020 }
1021
1022 };
1023
1024
1025
1026
1027 RandomStream& GetCurrentRandomStream();
1028 // get reference to the current random byte stream --
1029 // if SetSeed has not been called, it is called with
1030 // a default value (which should be unique to each
1031 // process/thread)
1032
1033
9321034 void SetSeed(const ZZ& s);
1035 void SetSeed(const unsigned char *data, long dlen);
1036 void SetSeed(const RandomStream& s);
9331037 // initialize random number generator
1038 // in the first two versions, a PRG key is derived from
1039 // the data using DeriveKey.
1040
1041
1042 // RAII for saving/restoring current state of PRG
1043
1044 class RandomStreamPush {
1045 private:
1046 RandomStream saved;
1047
1048 RandomStreamPush(const RandomStreamPush&); // disable
1049 void operator=(const RandomStreamPush&); // disable
1050
1051 public:
1052 RandomStreamPush() : saved(GetCurrentRandomStream()) { }
1053 ~RandomStreamPush() { SetSeed(saved); }
1054
1055 };
1056
1057
9341058
9351059
9361060 void RandomBnd(ZZ& x, const ZZ& n);
9581082 // single-precision version of the above
9591083
9601084 long RandomBnd(long n);
1085 inline void RandomBnd(long& x, long n) { x = RandomBnd(n); }
9611086
9621087 long RandomLen_long(long l);
1088 inline void RandomLen(long& x, long l) { x = RandomLen_long(l); }
9631089
9641090 long RandomBits_long(long l);
1091 inline void RandomBits(long& x, long l) { x = RandomBits_long(l); }
1092
1093
1094 // specialty routines
9651095
9661096 unsigned long RandomWord();
9671097 unsigned long RandomBits_ulong(long l);
12111341
12121342
12131343 inline long InvModStatus(ZZ& x, const ZZ& a, const ZZ& n)
1214 // if gcd(a,b) = 1, then ReturnValue = 0, x = a^{-1} mod n
1344 // if gcd(a,n) = 1, then ReturnValue = 0, x = a^{-1} mod n
12151345 // otherwise, ReturnValue = 1, x = gcd(a, n)
12161346
12171347 { return NTL_zinv(a.rep, n.rep, &x.rep); }
13291459
13301460 long InvMod(long a, long n);
13311461 // computes a^{-1} mod n. Error is raised if undefined.
1462
1463 long InvModStatus(long& x, long a, long n);
1464 // if gcd(a,n) = 1, then ReturnValue = 0, x = a^{-1} mod n
1465 // otherwise, ReturnValue = 1, x = gcd(a, n)
13321466
13331467 long PowerMod(long a, long e, long n);
13341468 // computes a^e mod n, e >= 0
4040 // f divides a polynomial h whose Euclidean norm
4141 // is bounded by 2^{bnd} in absolute value.
4242
43 NTL_THREAD_LOCAL extern long ZZXFac_MaxPrune;
44 NTL_THREAD_LOCAL extern long ZZXFac_InitNumPrimes;
45 NTL_THREAD_LOCAL extern long ZZXFac_MaxNumPrimes;
46 NTL_THREAD_LOCAL extern long ZZXFac_PowerHack;
47 NTL_THREAD_LOCAL extern long ZZXFac_van_Hoeij;
43 extern NTL_CHEAP_THREAD_LOCAL long ZZXFac_MaxPrune;
44 extern NTL_CHEAP_THREAD_LOCAL long ZZXFac_InitNumPrimes;
45 extern NTL_CHEAP_THREAD_LOCAL long ZZXFac_MaxNumPrimes;
46 extern NTL_CHEAP_THREAD_LOCAL long ZZXFac_PowerHack;
47 extern NTL_CHEAP_THREAD_LOCAL long ZZXFac_van_Hoeij;
4848
4949
5050 void factor(ZZ& c,
2323
2424 long NumPrimes;
2525 long MaxRoot;
26 bool QuickCRT;
2726 ZZ MinusMModP; // -M mod p, M = product of primes
2827 ZZ_CRTStructAdapter crt_struct;
2928 ZZ_RemStructAdapter rem_struct;
3029
3130
3231 // the following arrays are indexed 0..NumPrimes-1
33 // q = FFTPrime[i]
34 Vec<double> x; // u/q, where u = (M/q)^{-1} mod q
35 Vec<long> u; // u, as above
32 // q[i] = FFTPrime[i]
33 Vec<long> prime; // prime[i] = q[i]
34 Vec<double> prime_recip; // prime_recip[i] = 1/double(q[i])
35 Vec<long> u; // u[i] = (M/q[i])^{-1} mod q[i]
3636 Vec<mulmod_precon_t> uqinv;
37
38 ZZ_ReduceStructAdapter reduce_struct;
39
3740 };
3841
3942
6770 ZZ_TmpVecAdapter rem_tmp_vec;
6871 };
6972
70 NTL_THREAD_LOCAL
71 extern SmartPtr<ZZ_pInfoT> ZZ_pInfo;
73
74 extern
75 NTL_CHEAP_THREAD_LOCAL
76 ZZ_pInfoT *ZZ_pInfo;
7277 // info for current modulus, initially null
73
74 NTL_THREAD_LOCAL
75 extern SmartPtr<ZZ_pTmpSpaceT> ZZ_pTmpSpace;
78 // plain pointer for faster TLS access
79
80 extern
81 NTL_CHEAP_THREAD_LOCAL
82 ZZ_pTmpSpaceT *ZZ_pTmpSpace;
7683 // space for temps associated with current modulus,
77
78 NTL_THREAD_LOCAL
79 extern bool ZZ_pInstalled;
84 // plain pointer for faster TLS access
85
86 extern
87 NTL_CHEAP_THREAD_LOCAL
88 bool ZZ_pInstalled;
8089 // flag indicating if current modulus is fully installed
8190
8291
93102
94103 // copy constructor, assignment, destructor: default
95104
96 void save() { ptr = ZZ_pInfo; }
105 void save();
97106 void restore() const;
98107
99108 };
155164
156165 typedef void (*DivHandlerPtr)(const ZZ_p& a); // error-handler for division
157166
158 NTL_THREAD_LOCAL static DivHandlerPtr DivHandler;
167 static
168 NTL_CHEAP_THREAD_LOCAL
169 DivHandlerPtr DivHandler;
159170
160171
161172 // ****** constructors and assignment
206217 static ZZ_pTmpSpaceT* GetTmpSpace()
207218 {
208219 install();
209 return ZZ_pTmpSpace.get();
220 return ZZ_pTmpSpace;
210221 }
211222
212223
521532 ~ZZ_pWatcher() { watched.KillBig(); }
522533 };
523534
524 #define NTL_ZZ_pRegister(x) NTL_THREAD_LOCAL static ZZ_p x; ZZ_pWatcher _WATCHER__ ## x(x); x.allocate()
535 #define NTL_ZZ_pRegister(x) NTL_TLS_LOCAL(ZZ_p, x); ZZ_pWatcher _WATCHER__ ## x(x); x.allocate()
525536
526537 // FIXME: register variables that are allocated with respect to one modulus
527538 // and then reused with another modulus may have initial values that are
3030
3131 };
3232
33 NTL_THREAD_LOCAL
34 extern SmartPtr<ZZ_pEInfoT> ZZ_pEInfo; // info for current modulus, initially null
33 extern
34 NTL_CHEAP_THREAD_LOCAL
35 ZZ_pEInfoT *ZZ_pEInfo;
36 // info for current modulus, initially null
37 // raw pointer for faster TLS access
3538
3639
3740
847847 vec_ZZ_pEX H;
848848 };
849849
850 NTL_THREAD_LOCAL extern long ZZ_pEXArgBound;
850 extern NTL_CHEAP_THREAD_LOCAL long ZZ_pEXArgBound;
851851
852852
853853 void build(ZZ_pEXArgument& H, const ZZ_pEX& h, const ZZ_pEXModulus& F, long m);
3535 // assumes that f is monic and splits into distinct linear factors
3636
3737
38 NTL_THREAD_LOCAL extern long ZZ_pEX_GCDTableSize; /* = 4 */
38 extern
39 NTL_CHEAP_THREAD_LOCAL
40 long ZZ_pEX_GCDTableSize; /* = 4 */
3941 // Controls GCD blocking for NewDDF
4042
41 NTL_THREAD_LOCAL extern double ZZ_pEXFileThresh;
43 extern
44 NTL_CHEAP_THREAD_LOCAL
45 double ZZ_pEXFileThresh;
4246 // of these tables exceeds ZZ_pEXFileThresh KB.
4347
4448
655655 // converts coefficients lo..hi to a 2^k-point FFTRep.
656656 // must have hi-lo+1 < 2^k
657657
658
659 void FromFFTRep(ZZ_pXModRep& x, const FFTRep& a);
660 // for testing and timing purposes only -- converts from FFTRep
661
662 void FromZZ_pXModRep(ZZ_pX& x, const ZZ_pXModRep& a, long lo, long hi);
663 // for testing and timing purposes only -- converts from ZZ_pXModRep
658664
659665
660666
11011107 vec_ZZ_pX H;
11021108 };
11031109
1104 NTL_THREAD_LOCAL extern long ZZ_pXArgBound;
1110 extern NTL_CHEAP_THREAD_LOCAL long ZZ_pXArgBound;
11051111
11061112
11071113 void build(ZZ_pXArgument& H, const ZZ_pX& h, const ZZ_pXModulus& F, long m);
6767 // Uses "Berlekamp" approach.
6868
6969
70 NTL_THREAD_LOCAL extern long ZZ_pX_BlockingFactor;
70 extern NTL_CHEAP_THREAD_LOCAL long ZZ_pX_BlockingFactor;
7171 // Controls GCD blocking for DDF.
7272
7373 void DDF(vec_pair_ZZ_pX_long& factors, const ZZ_pX& f, const ZZ_pX& h,
8181 // Assumes f is monic and square-free, and h = X^p mod f
8282 // Obsolete: see NewDDF, below.
8383
84 NTL_THREAD_LOCAL extern long ZZ_pX_GCDTableSize; /* = 4 */
84 extern NTL_CHEAP_THREAD_LOCAL long ZZ_pX_GCDTableSize; /* = 4 */
8585 // Controls GCD blocking for NewDDF
8686
8787
88 NTL_THREAD_LOCAL extern double ZZ_pXFileThresh;
88 extern NTL_CHEAP_THREAD_LOCAL double ZZ_pXFileThresh;
8989 // external files are used for baby/giant steps if size
9090 // of these tables exceeds ZZ_pXFileThresh KB.
9191
8383 #endif
8484
8585 #if 0
86 #define NTL_DISABLE_TLS_HACK
87
88 /* Set if you want to compile NTL without "TLS hack"
89 *
90 * To re-build after changing this flag: rm *.o; make ntl.a
91 */
92
93 #endif
94
95 #if 0
96 #define NTL_ENABLE_TLS_HACK
97
98 /* Set if you want to compile NTL with "TLS hack"
99 *
100 * To re-build after changing this flag: rm *.o; make ntl.a
101 */
102
103 #endif
104
105 #if 0
86106 #define NTL_THREADS
87107
88108 /* Set if you want to compile NTL as a thread-safe library.
103123
104124 #endif
105125
106
107 #if 0
126 #if 0
127 #define NTL_THREAD_BOOST
128
129 /* Set if you want to compile NTL to exploit threads internally.
130 *
131 * To re-build after changing this flag: rm *.o; make ntl.a
132 */
133
134 #endif
135 #
136
137 #if 1
108138 #define NTL_GMP_LIP
109139
110140 /*
147177
148178 #endif
149179
150 #if 0
151 #define NTL_PCLMUL
152
153 /*
154 * Use this flag for faster GF2X arithmetc.
155 * This enables the use of the PCLMUL instruction on x86-64
156 * machines.
157 *
158 * To re-build after changing this flag:
159 * rm GF2X.o; make ntl.a
160 */
161
162 #endif
163180
164181 #if 0
165182 #define NTL_LONG_LONG_TYPE long long
300317 #if 0
301318 #define NTL_DISABLE_LONGDOUBLE
302319
303 /* Explicitly disables us of long double arithmetic in the
304 * single-precision modular arithmetic routines
320 /* Explicitly disables use of long double arithmetic
305321 */
306322
307323 #endif
310326 #if 0
311327 #define NTL_DISABLE_LONGLONG
312328
313 /* Explicitly disables us of long long arithmetic in the
314 * single-precision modular arithmetic routines
315 */
316
317 #endif
318
319
329 /* Explicitly disables use of long long arithmetic
330 */
331
332 #endif
333
334 #if 0
335 #define NTL_DISABLE_LL_ASM
336
337 /* Explicitly disables use of inline assembly as a replacement
338 * for long long arithmetic.
339 */
340
341 #endif
342
343
344 #if 0
345 #define NTL_MAXIMIZE_SP_NBITS
346
347 /* Allows for 62-bit single-precision moduli on 64-bit platforms.
348 * By default, such moduli are restricted to 60 bits, which
349 * usually gives slightly better performance across a range of
350 * of parameters.
351 */
352
353 #endif
320354
321355 /*************************************************************************
322356 *
505539 #endif
506540
507541
542 #if 0
543 #define NTL_CRT_ALTCODE
544
545 /*
546 * Employs an alternative CRT strategy.
547 * Only relevant with GMP.
548 * Seems to be marginally faster on some x86_64 platforms.
549 *
550 * To re-build after changing this flag:
551 * rm lip.o; make ntl.a
552 */
553
554 #endif
555
556 #if 0
557 #define NTL_CRT_ALTCODE_SMALL
558
559 /*
560 * Employs an alternative CRT strategy for small moduli.
561 * Only relevant with GMP.
562 * Seems to be marginally faster on some x86_64 platforms.
563 *
564 * To re-build after changing this flag:
565 * rm lip.o; make ntl.a
566 */
567
568 #endif
569
508570
509571 #if 0
510572 #define NTL_GF2X_ALTCODE
547609 #endif
548610
549611
550
551
552
553
554
555
556
557
558 #endif
612 #if 0
613 #define NTL_PCLMUL
614
615 /*
616 * Use this flag for faster GF2X arithmetc.
617 * This enables the use of the PCLMUL instruction on x86-64
618 * machines.
619 *
620 * To re-build after changing this flag:
621 * rm GF2X.o; make ntl.a
622 */
623
624 #endif
625
626
627
628
629
630
631 #endif
33
44 #include <NTL/config.h>
55 #include <NTL/mach_desc.h>
6 #include <NTL/have_LL.h>
7 #include <NTL/have_builtin_clzl.h>
6 #include <NTL/HAVE_LL_TYPE.h>
7 #include <NTL/HAVE_BUILTIN_CLZL.h>
8 #include <NTL/HAVE_AVX.h>
9 #include <NTL/HAVE_FMA.h>
810
911
1012 /*
7375
7476 #endif
7577
78
79 #ifdef NTL_HAVE_LL_TYPE
80
81 typedef NTL_LL_TYPE _ntl_longlong;
82 typedef NTL_ULL_TYPE _ntl_ulonglong;
83 // typenames are more convenient than macros
84
85 #else
86
87 #undef NTL_LL_TYPE
88 #undef NTL_ULL_TYPE
89 // prevent any use of these macros
90
91 class _ntl_longlong { private: _ntl_longlong() { } };
92 class _ntl_ulonglong { private: _ntl_ulonglong() { } };
93 // cannot create variables of these types
94
95
96 #endif
97
7698 /********************************************************/
99
100
101
102 // Define an unsigned type with at least 32 bits
103 // there is no truly portable way to do this, yet...
104
105
106 #if (NTL_BITS_PER_INT >= 32)
107
108 typedef unsigned int _ntl_uint32; // 32-bit word
109 #define NTL_BITS_PER_INT32 NTL_BITS_PER_INT
110
111 #else
112
113 // NOTE: C++ standard guarantees longs are at least 32 bits wide,
114 // and this is also explicitly checked at build time
115
116 typedef unsigned long _ntl_uint32; // 32-bit word
117 #define NTL_BITS_PER_INT32 NTL_BITS_PER_LONG
118
119 #endif
120
77121
78122
79123 // The usual token pasting stuff...
280324
281325 #define NTL_THREAD_LOCAL thread_local
282326
327 #ifdef __GNUC__
328 #define NTL_CHEAP_THREAD_LOCAL __thread
329 #else
330 #define NTL_CHEAP_THREAD_LOCAL thread_local
331 #endif
332
283333 #else
284334
285335 #define NTL_THREAD_LOCAL
336 #define NTL_CHEAP_THREAD_LOCAL
286337
287338 #endif
288339
340391 as the C++ standard is kind of broken on the issue of where
341392 swap is defined. And I also only want it defined for built-in types.
342393 */
394
395
396
397
398 // The following is for aligning small local arrays
399 // Equivalent to type x[n], but aligns to align bytes
400 // Only works for POD types
401 // NOTE: the gcc aligned attribute might work, but there is
402 // some chatter on the web that this was (at some point) buggy.
403 // Not clear what the current status is.
404 // Anyway, this is only intended for use with gcc on intel
405 // machines, so it should be OK.
406
407
// Declares a local array x of n elements of POD type `type`, aligned to
// `align` bytes: raw char storage is over-allocated by `align` bytes, and x
// points at the first address inside it that is a multiple of `align`.
// The n and align arguments are parenthesized so that expression arguments
// (e.g. a length of k+1) size the storage correctly.
#define NTL_ALIGNED_LOCAL_ARRAY(align, x, type, n) \
   char x##__ntl_hidden_variable_storage[(n)*sizeof(type)+(align)];  \
   type *x = (type *) ((&x##__ntl_hidden_variable_storage[0]) + \
      ((-((unsigned long) (&x##__ntl_hidden_variable_storage[0]))) % \
       (unsigned long)(align)))
414
415 #define NTL_AVX_BYTE_ALIGN (32)
416 #define NTL_AVX_DBL_ALIGN (NTL_AVX_BYTE_ALIGN/long(sizeof(double)))
417
418 #define NTL_AVX_LOCAL_ARRAY(x, type, n) NTL_ALIGNED_LOCAL_ARRAY(NTL_AVX_BYTE_ALIGN, x, type, n)
419
420 #define NTL_DEFAULT_ALIGN (64)
421 // this should be big enough to satisfy any SIMD instructions,
422 // and it should also be as big as a cache line
423
424
343425
344426
345427 #endif
8383 #endif
8484
8585 #if 0
86 #define NTL_DISABLE_TLS_HACK
87
88 /* Set if you want to compile NTL without "TLS hack"
89 *
90 * To re-build after changing this flag: rm *.o; make ntl.a
91 */
92
93 #endif
94
95 #if 0
96 #define NTL_ENABLE_TLS_HACK
97
98 /* Set if you want to compile NTL with "TLS hack"
99 *
100 * To re-build after changing this flag: rm *.o; make ntl.a
101 */
102
103 #endif
104
105 #if 0
86106 #define NTL_THREADS
87107
88108 /* Set if you want to compile NTL as a thread-safe library.
103123
104124 #endif
105125
106
107 #if 0
126 #if 0
127 #define NTL_THREAD_BOOST
128
129 /* Set if you want to compile NTL to exploit threads internally.
130 *
131 * To re-build after changing this flag: rm *.o; make ntl.a
132 */
133
134 #endif
135 #
136
137 #if 1
108138 #define NTL_GMP_LIP
109139
110140 /*
147177
148178 #endif
149179
150 #if 0
151 #define NTL_PCLMUL
152
153 /*
154 * Use this flag for faster GF2X arithmetc.
155 * This enables the use of the PCLMUL instruction on x86-64
156 * machines.
157 *
158 * To re-build after changing this flag:
159 * rm GF2X.o; make ntl.a
160 */
161
162 #endif
163180
164181 #if 0
165182 #define NTL_LONG_LONG_TYPE long long
300317 #if 0
301318 #define NTL_DISABLE_LONGDOUBLE
302319
303 /* Explicitly disables us of long double arithmetic in the
304 * single-precision modular arithmetic routines
320 /* Explicitly disables use of long double arithmetic
305321 */
306322
307323 #endif
310326 #if 0
311327 #define NTL_DISABLE_LONGLONG
312328
313 /* Explicitly disables us of long long arithmetic in the
314 * single-precision modular arithmetic routines
315 */
316
317 #endif
318
319
329 /* Explicitly disables use of long long arithmetic
330 */
331
332 #endif
333
334 #if 0
335 #define NTL_DISABLE_LL_ASM
336
337 /* Explicitly disables use of inline assembly as a replacement
338 * for long long arithmetic.
339 */
340
341 #endif
342
343
344 #if 0
345 #define NTL_MAXIMIZE_SP_NBITS
346
347 /* Allows for 62-bit single-precision moduli on 64-bit platforms.
348 * By default, such moduli are restricted to 60 bits, which
349 * usually gives slightly better performance across a range
350 * of parameters.
351 */
352
353 #endif
320354
321355 /*************************************************************************
322356 *
505539 #endif
506540
507541
542 #if 0
543 #define NTL_CRT_ALTCODE
544
545 /*
546 * Employs an alternative CRT strategy.
547 * Only relevant with GMP.
548 * Seems to be marginally faster on some x86_64 platforms.
549 *
550 * To re-build after changing this flag:
551 * rm lip.o; make ntl.a
552 */
553
554 #endif
555
556 #if 0
557 #define NTL_CRT_ALTCODE_SMALL
558
559 /*
560 * Employs an alternative CRT strategy for small moduli.
561 * Only relevant with GMP.
562 * Seems to be marginally faster on some x86_64 platforms.
563 *
564 * To re-build after changing this flag:
565 * rm lip.o; make ntl.a
566 */
567
568 #endif
569
508570
509571 #if 0
510572 #define NTL_GF2X_ALTCODE
547609 #endif
548610
549611
550
551
552
553
554
555
556
557
558 #endif
612 #if 0
613 #define NTL_PCLMUL
614
615 /*
616 * Use this flag for faster GF2X arithmetic.
617 * This enables the use of the PCLMUL instruction on x86-64
618 * machines.
619 *
620 * To re-build after changing this flag:
621 * rm GF2X.o; make ntl.a
622 */
623
624 #endif
625
626
627
628
629
630
631 #endif
2121 #endif
2222
2323
24 #if (defined(NTL_HAVE_LL_TYPE) && !defined(NTL_LEGACY_SP_MULMOD) && !defined(NTL_DISABLE_LONGLONG))
24 #if (defined(NTL_HAVE_LL_TYPE) && !defined(NTL_LEGACY_SP_MULMOD))
2525
2626 #define NTL_LONGLONG_SP_MULMOD
2727
+0
-0
include/NTL/have_LL.h less more
(Empty file)
+0
-0
include/NTL/have_LL_no.h less more
(Empty file)
+0
-4
include/NTL/have_LL_yes.h less more
0 #ifndef NTL_HAVE_LL_TYPE
1 #define NTL_HAVE_LL_TYPE
2 #endif
3
+0
-0
include/NTL/have_builtin_clzl.h less more
(Empty file)
+0
-0
include/NTL/have_builtin_clzl_no.h less more
(Empty file)
+0
-4
include/NTL/have_builtin_clzl_yes.h less more
0 #ifndef NTL_HAVE_BUILTIN_CLZL
1 #define NTL_HAVE_BUILTIN_CLZL
2 #endif
3
5050 _ntl_rem_struct_build(long n, NTL_verylong modulus, long (*p)(long));
5151
5252
53 // montgomery
54 class _ntl_reduce_struct {
55 public:
56 virtual ~_ntl_reduce_struct() { }
57 virtual void eval(NTL_verylong *x, NTL_verylong *a) = 0;
58 virtual void adjust(NTL_verylong *x) = 0;
59 };
60
61 _ntl_reduce_struct *
62 _ntl_reduce_struct_build(NTL_verylong modulus, NTL_verylong excess);
63
64
65 // faster reduction with preconditioning -- general usage, single modulus
66
67 class _ntl_general_rem_one_struct {
68 public:
69 virtual ~_ntl_general_rem_one_struct() { }
70 };
71
72 _ntl_general_rem_one_struct *
73 _ntl_general_rem_one_struct_build(long p, long sz);
74
75 long
76 _ntl_general_rem_one_struct_apply(NTL_verylong a, long p, _ntl_general_rem_one_struct *pinfo);
77
78
5379
5480
5581
2626 mulmod_t pinv;
2727
2828 sp_reduce_struct red_struct;
29 sp_ll_reduce_struct ll_red_struct;
2930
3031 FFTPrimeInfo* p_info; // non-null means we are directly using
3132 // an FFT prime
5758 Vec<mulmod_precon_t> uqinv; // MulModPrecon for u
5859 };
5960
60 NTL_THREAD_LOCAL extern SmartPtr<zz_pInfoT> zz_pInfo; // current modulus, initially null
61 extern
62 NTL_CHEAP_THREAD_LOCAL
63 zz_pInfoT *zz_pInfo;
64 // current modulus, initially null
6165
6266
6367 class zz_pContext {
122126 explicit zz_pPush(const zz_pContext& context) { bak.save(); context.restore(); }
123127
124128 explicit zz_pPush(long p, long maxroot=NTL_FFTMaxRoot)
125 { bak.save(); zz_pContext c(p); c.restore(); }
129 { bak.save(); zz_pContext c(p, maxroot); c.restore(); }
126130
127131 zz_pPush(INIT_FFT_TYPE, long index)
128132 { bak.save(); zz_pContext c(INIT_FFT, index); c.restore(); }
179183 static long modulus() { return zz_pInfo->p; }
180184 static zz_p zero() { return zz_p(); }
181185 static mulmod_t ModulusInverse() { return zz_pInfo->pinv; }
186 static sp_reduce_struct red_struct() { return zz_pInfo->red_struct; }
187 static sp_ll_reduce_struct ll_red_struct() { return zz_pInfo->ll_red_struct; }
182188 static long PrimeCnt() { return zz_pInfo->PrimeCnt; }
183189
184190
422428
423429
424430 void conv(Vec<zz_p>& x, const Vec<ZZ>& a);
425 // explicit instantiation of more efficient version,
431 void conv(Vec<zz_p>& x, const Vec<long>& a);
432 // explicit instantiation of more efficient versions,
426433 // defined in vec_lzz_p.c
427434
428435
441448 /* ------------------------------------- */
442449
443450
451 // *********************************************************
452 // *** specialized inner-product routines, for internal consumption
453 // *********************************************************
454
455 #ifdef NTL_HAVE_LL_TYPE
456 long
457 InnerProd_LL(const long *ap, const zz_p *bp, long n, long d,
458 sp_ll_reduce_struct dinv);
459
460 long
461 InnerProd_LL(const zz_p *ap, const zz_p *bp, long n, long d,
462 sp_ll_reduce_struct dinv);
463 #endif
464
465
466 long
467 InnerProd_L(const long *ap, const zz_p *bp, long n, long d,
468 sp_reduce_struct dinv);
469
470 long
471 InnerProd_L(const zz_p *ap, const zz_p *bp, long n, long d,
472 sp_reduce_struct dinv);
473
444474
445475 NTL_CLOSE_NNS
446476
2929
3030 };
3131
32 NTL_THREAD_LOCAL
33 extern SmartPtr<zz_pEInfoT> zz_pEInfo; // info for current modulus, initially null
32 extern
33 NTL_CHEAP_THREAD_LOCAL
34 zz_pEInfoT *zz_pEInfo;
35 // info for current modulus, initially null
36 // fast TLS access
3437
3538
3639
842842 vec_zz_pEX H;
843843 };
844844
845 NTL_THREAD_LOCAL extern long zz_pEXArgBound;
845 extern
846 NTL_CHEAP_THREAD_LOCAL
847 long zz_pEXArgBound;
846848
847849
848850 void build(zz_pEXArgument& H, const zz_pEX& h, const zz_pEXModulus& F, long m);
3535 // assumes that f is monic and splits into distinct linear factors
3636
3737
38 NTL_THREAD_LOCAL extern long zz_pEX_GCDTableSize; /* = 4 */
38 extern
39 NTL_CHEAP_THREAD_LOCAL
40 long zz_pEX_GCDTableSize; /* = 4 */
3941 // Controls GCD blocking for NewDDF
4042
4143
42 NTL_THREAD_LOCAL extern double zz_pEXFileThresh;
44 extern
45 NTL_CHEAP_THREAD_LOCAL
46 double zz_pEXFileThresh;
4347 // external files are used for baby/giant steps if size
4448 // of these tables exceeds zz_pEXFileThresh KB.
4549
55 #include <NTL/lzz_p.h>
66 #include <NTL/vec_lzz_p.h>
77 #include <NTL/Lazy.h>
8 #include <NTL/SmartPtr.h>
89
910 NTL_OPEN_NNS
1011
10921093 vec_zz_pX H;
10931094 };
10941095
1095 NTL_THREAD_LOCAL extern long zz_pXArgBound;
1096 extern
1097 NTL_CHEAP_THREAD_LOCAL
1098 long zz_pXArgBound;
10961099
10971100
10981101 void build(zz_pXArgument& H, const zz_pX& h, const zz_pXModulus& F, long m);
11051108 inline zz_pX
11061109 CompMod(const zz_pX& g, const zz_pXArgument& H, const zz_pXModulus& F)
11071110 { zz_pX x; CompMod(x, g, H, F); NTL_OPT_RETURN(zz_pX, x); }
1111
1112
1113
1114 // experimental variant that yields a faster ModComp
1115 // Usage:
1116 // zz_pXArgument H;
1117 // build(H, h, F);
1118 // zz_pXAltArgument H1;
1119 // build(H1, H, F); // this keeps a pointer to H, so H must remain alive
1120 // CompMod(x, g, H1, F); // x = g(h) mod f
1121
1122 struct zz_pXAltArgument {
1123
1124 const zz_pXArgument *orig;
1125 zz_pXAltArgument() : orig(0) {}
1126
1127 #ifdef NTL_HAVE_LL_TYPE
1128 long strategy;
1129
1130 long n, m;
1131 Vec< Vec<long> > mem;
1132 Vec<long*> row;
1133
1134 // NOTE: the following two members are used only if
1135 // NTL_HAVE_AVX; however, we declare them unconditionally
1136 // to facilitate the possibility of dynamic linking based
1137 // on architecture
1138 Vec< AlignedArray<double> > dmem;
1139 Vec<double*> drow;
1140
1141 sp_ll_reduce_struct pinv_LL;
1142 sp_reduce_struct pinv_L;
1143 #endif
1144 };
1145
1146
1147 void build(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F);
1148 void CompMod(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
1149 const zz_pXModulus& F);
1150
11081151
11091152
11101153
7373
7474
7575
76 NTL_THREAD_LOCAL extern long zz_pX_BlockingFactor;
76 extern
77 NTL_CHEAP_THREAD_LOCAL
78 long zz_pX_BlockingFactor;
7779 // Controls GCD blocking for DDF.
7880
7981
8991 // Obsolete: see NewDDF, below.
9092
9193
92 NTL_THREAD_LOCAL extern long zz_pX_GCDTableSize; /* = 4 */
94 extern
95 NTL_CHEAP_THREAD_LOCAL
96 long zz_pX_GCDTableSize; /* = 4 */
9397 // Controls GCD blocking for NewDDF
9498
9599
5151 long IsIdent(const mat_GF2& A, long n);
5252 void transpose(mat_GF2& X, const mat_GF2& A);
5353 void solve(ref_GF2 d, vec_GF2& X, const mat_GF2& A, const vec_GF2& b);
54 void solve(ref_GF2 d, const mat_GF2& A, vec_GF2& x, const vec_GF2& b);
5455 void inv(ref_GF2 d, mat_GF2& X, const mat_GF2& A);
5556
5657 inline void sqr(mat_GF2& X, const mat_GF2& A)
3737 void determinant(GF2E& d, const mat_GF2E& A);
3838 long IsIdent(const mat_GF2E& A, long n);
3939 void transpose(mat_GF2E& X, const mat_GF2E& A);
40 void solve(GF2E& d, vec_GF2E& X,
41 const mat_GF2E& A, const vec_GF2E& b);
40 void solve(GF2E& d, vec_GF2E& x, const mat_GF2E& A, const vec_GF2E& b);
41 void solve(GF2E& d, const mat_GF2E& A, vec_GF2E& x, const vec_GF2E& b);
4242 void inv(GF2E& d, mat_GF2E& X, const mat_GF2E& A);
4343
4444 inline void sqr(mat_GF2E& X, const mat_GF2E& A)
3434 void determinant(ZZ_p& d, const mat_ZZ_p& A);
3535 long IsIdent(const mat_ZZ_p& A, long n);
3636 void transpose(mat_ZZ_p& X, const mat_ZZ_p& A);
37 void solve(ZZ_p& d, vec_ZZ_p& X,
38 const mat_ZZ_p& A, const vec_ZZ_p& b);
37 void solve(ZZ_p& d, vec_ZZ_p& X, const mat_ZZ_p& A, const vec_ZZ_p& b);
38 void solve(ZZ_p& d, const mat_ZZ_p& A, vec_ZZ_p& x, const vec_ZZ_p& b);
3939 void inv(ZZ_p& d, mat_ZZ_p& X, const mat_ZZ_p& A);
4040
4141 inline void sqr(mat_ZZ_p& X, const mat_ZZ_p& A)
4444 inline mat_ZZ_pE transpose(const mat_ZZ_pE& A)
4545 { mat_ZZ_pE X; transpose(X, A); NTL_OPT_RETURN(mat_ZZ_pE, X); }
4646
47 void solve(ZZ_pE& d, vec_ZZ_pE& X,
48 const mat_ZZ_pE& A, const vec_ZZ_pE& b);
47 void solve(ZZ_pE& d, vec_ZZ_pE& x, const mat_ZZ_pE& A, const vec_ZZ_pE& b);
48 void solve(ZZ_pE& d, const mat_ZZ_pE& A, vec_ZZ_pE& x, const vec_ZZ_pE& b);
4949
5050 void inv(ZZ_pE& d, mat_ZZ_pE& X, const mat_ZZ_pE& A);
5151
3030 inline mat_zz_p ident_mat_zz_p(long n)
3131 { mat_zz_p X; ident(X, n); NTL_OPT_RETURN(mat_zz_p, X); }
3232
33 void determinant(zz_p& d, const mat_zz_p& A);
3433 long IsIdent(const mat_zz_p& A, long n);
3534 void transpose(mat_zz_p& X, const mat_zz_p& A);
36 void solve(zz_p& d, vec_zz_p& X,
37 const mat_zz_p& A, const vec_zz_p& b);
38 void inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A);
35
36
37
38 // ************************
39
40 void relaxed_solve(zz_p& d, vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b, bool relax=true);
41 void relaxed_solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b, bool relax=true);
42
43 void relaxed_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax=true);
44 inline void relaxed_inv(mat_zz_p& X, const mat_zz_p& A, bool relax=true)
45 { zz_p d; relaxed_inv(d, X, A, relax); if (d == 0) ArithmeticError("inv: non-invertible matrix"); }
46 inline mat_zz_p relaxed_inv(const mat_zz_p& A, bool relax=true)
47 { mat_zz_p X; relaxed_inv(X, A, relax); NTL_OPT_RETURN(mat_zz_p, X); }
48
49 void relaxed_determinant(zz_p& d, const mat_zz_p& A, bool relax=true);
50 inline zz_p relaxed_determinant(const mat_zz_p& a, bool relax=true)
51 { zz_p x; relaxed_determinant(x, a, relax); return x; }
52
53 void relaxed_power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e, bool relax=true);
54 inline mat_zz_p relaxed_power(const mat_zz_p& A, const ZZ& e, bool relax=true)
55 { mat_zz_p X; relaxed_power(X, A, e, relax); NTL_OPT_RETURN(mat_zz_p, X); }
56 inline void relaxed_power(mat_zz_p& X, const mat_zz_p& A, long e, bool relax=true)
57 { relaxed_power(X, A, ZZ_expo(e), relax); }
58 inline mat_zz_p relaxed_power(const mat_zz_p& A, long e, bool relax=true)
59 { mat_zz_p X; relaxed_power(X, A, e, relax); NTL_OPT_RETURN(mat_zz_p, X); }
60
61 // ***********************
62
63 inline void solve(zz_p& d, vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
64 { relaxed_solve(d, x, A, b, false); }
65
66 inline void solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b)
67 { relaxed_solve(d, A, x, b, false); }
68
69 inline void inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A)
70 { relaxed_inv(d, X, A, false); }
71
72 inline void inv(mat_zz_p& X, const mat_zz_p& A)
73 { relaxed_inv(X, A, false); }
74
75 inline mat_zz_p inv(const mat_zz_p& A)
76 { return relaxed_inv(A, false); }
77
78 inline void determinant(zz_p& d, const mat_zz_p& A)
79 { relaxed_determinant(d, A, false); }
80
81 inline zz_p determinant(const mat_zz_p& a)
82 { return relaxed_determinant(a, false); }
83
84 inline void power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e)
85 { relaxed_power(X, A, e, false); }
86
87 inline mat_zz_p power(const mat_zz_p& A, const ZZ& e)
88 { return relaxed_power(A, e, false); }
89
90 inline void power(mat_zz_p& X, const mat_zz_p& A, long e)
91 { relaxed_power(X, A, e, false); }
92
93 inline mat_zz_p power(const mat_zz_p& A, long e)
94 { return relaxed_power(A, e, false); }
95
96 // ************************
97
3998
4099 inline void sqr(mat_zz_p& X, const mat_zz_p& A)
41100 { mul(X, A, A); }
43102 inline mat_zz_p sqr(const mat_zz_p& A)
44103 { mat_zz_p X; sqr(X, A); NTL_OPT_RETURN(mat_zz_p, X); }
45104
46 void inv(mat_zz_p& X, const mat_zz_p& A);
47
48 inline mat_zz_p inv(const mat_zz_p& A)
49 { mat_zz_p X; inv(X, A); NTL_OPT_RETURN(mat_zz_p, X); }
50
51 void power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e);
52 inline mat_zz_p power(const mat_zz_p& A, const ZZ& e)
53 { mat_zz_p X; power(X, A, e); NTL_OPT_RETURN(mat_zz_p, X); }
54
55 inline void power(mat_zz_p& X, const mat_zz_p& A, long e)
56 { power(X, A, ZZ_expo(e)); }
57 inline mat_zz_p power(const mat_zz_p& A, long e)
58 { mat_zz_p X; power(X, A, e); NTL_OPT_RETURN(mat_zz_p, X); }
105
106
59107
60108
61109 void diag(mat_zz_p& X, long n, zz_p d);
74122
75123 // miscellaneous:
76124
77 inline zz_p determinant(const mat_zz_p& a)
78 { zz_p x; determinant(x, a); return x; }
79 // functional variant of determinant
80125
81126 inline mat_zz_p transpose(const mat_zz_p& a)
82127 { mat_zz_p x; transpose(x, a); NTL_OPT_RETURN(mat_zz_p, x); }
4444 inline mat_zz_pE transpose(const mat_zz_pE& A)
4545 { mat_zz_pE X; transpose(X, A); NTL_OPT_RETURN(mat_zz_pE, X); }
4646
47 void solve(zz_pE& d, vec_zz_pE& X,
48 const mat_zz_pE& A, const vec_zz_pE& b);
47 void solve(zz_pE& d, vec_zz_pE& x, const mat_zz_pE& A, const vec_zz_pE& b);
48 void solve(zz_pE& d, const mat_zz_pE& A, vec_zz_pE& x, const vec_zz_pE& b);
4949
5050 void inv(zz_pE& d, mat_zz_pE& X, const mat_zz_pE& A);
5151
6565
6666 inline quad_float& operator=(double x);
6767
68 NTL_THREAD_LOCAL static long oprec;
68 static
69 NTL_CHEAP_THREAD_LOCAL
70 long oprec;
6971
7072 static void SetOutputPrecision(long p);
7173 static long OutputPrecision() { return oprec; }
7779
7880 }; // end class quad_float
7981
82
83
84
8085 #if (NTL_BITS_PER_LONG < NTL_DOUBLE_PRECISION)
8186
8287 // FIXME: we could make this <=, and even BPL <= DP+1 for
8792
8893 #else
8994
95
9096 quad_float to_quad_float(long n);
9197 quad_float to_quad_float(unsigned long n);
9298
9399 #endif
100
101
94102
95103 #if (NTL_BITS_PER_INT < NTL_DOUBLE_PRECISION)
96104
108116
109117
110118
111 inline quad_float to_quad_float(double x) { return quad_float(x, 0); }
112 // On platforms with extended doubles, this may result in an
113 // improper quad_float object, but it should be converted to a proper
114 // one when passed by reference to any of the arithmetic routines,
115 // at which time x will be forced to memory.
119
120 inline quad_float to_quad_float(double x) { return quad_float(TrueDouble(x), 0); }
116121
117122 inline quad_float to_quad_float(float x)
118123 { return to_quad_float(double(x)); }
300300
301301 // **********************************************************************
302302
303
304
305
306
307 #ifdef NTL_HAVE_BUILTIN_CLZL
308
309 static inline long
310 sp_CountLeadingZeros(unsigned long x)
311 {
312 return __builtin_clzl(x);
313 }
314
315 #else
316
317 static inline long
318 sp_CountLeadingZeros(unsigned long x)
319 {
320 long res = NTL_BITS_PER_LONG-NTL_SP_NBITS;
321 x = x << NTL_BITS_PER_LONG-NTL_SP_NBITS;
322 while (x < (1UL << (NTL_BITS_PER_LONG-1))) {
323 x <<= 1;
324 res++;
325 }
326
327 return res;
328 }
329
330
331 #endif
303332
304333
305334
627656 #endif
628657
629658
630
631 #ifdef NTL_HAVE_BUILTIN_CLZL
632
633 static inline long
634 sp_CountLeadingZeros(unsigned long x)
635 {
636 return __builtin_clzl(x);
637 }
638
639 #else
640
641 static inline long
642 sp_CountLeadingZeros(unsigned long x)
643 {
644 long res = NTL_BITS_PER_LONG-NTL_SP_NBITS;
645 x = x << NTL_BITS_PER_LONG-NTL_SP_NBITS;
646 while (x < (1UL << (NTL_BITS_PER_LONG-1))) {
647 x <<= 1;
648 res++;
649 }
650
651 return res;
652 }
653
654
655 #endif
656
657
658
659659 static inline sp_inverse
660660 PrepMulMod(long n)
661661 {
673673 static inline long
674674 sp_NormalizedMulMod(long a, long b, long n, unsigned long ninv)
675675 {
676 NTL_ULL_TYPE U = ((NTL_ULL_TYPE) cast_unsigned(a)) * ((NTL_ULL_TYPE) cast_unsigned(b));
677 unsigned long H = U >> (NTL_SP_NBITS-2);
676 ll_type U;
677 ll_imul(U, a, b);
678 unsigned long H = ll_rshift_get_lo<NTL_SP_NBITS-2>(U);
678679 unsigned long Q = MulHiUL(H, ninv);
679680 Q = Q >> NTL_POST_SHIFT;
680 unsigned long L = U;
681 unsigned long L = ll_get_lo(U);
681682 long r = L - Q*cast_unsigned(n); // r in [0..2*n)
682683
683684 r = sp_CorrectExcess(r, n);
684685 return r;
685686 }
687
688
686689
687690 static inline long
688691 MulMod(long a, long b, long n, sp_inverse ninv)
708711 static inline long
709712 sp_NormalizedMulModWithQuo(long& qres, long a, long b, long n, unsigned long ninv)
710713 {
711 NTL_ULL_TYPE U = ((NTL_ULL_TYPE) cast_unsigned(a)) * ((NTL_ULL_TYPE) cast_unsigned(b));
712 unsigned long H = U >> (NTL_SP_NBITS-2);
714 ll_type U;
715 ll_imul(U, a, b);
716 unsigned long H = ll_rshift_get_lo<NTL_SP_NBITS-2>(U);
713717 unsigned long Q = MulHiUL(H, ninv);
714718 Q = Q >> NTL_POST_SHIFT;
715 unsigned long L = U;
719 unsigned long L = ll_get_lo(U);
716720 long r = L - Q*cast_unsigned(n); // r in [0..2*n)
717721
718722 r = sp_CorrectExcessQuo(Q, r, n);
753757
754758 #else
755759
760
756761 static inline unsigned long
757762 sp_NormalizedPrepMulModPrecon(long b, long n, unsigned long ninv)
758763 {
759 NTL_ULL_TYPE U = ((NTL_ULL_TYPE) cast_unsigned(b)) << NTL_SP_NBITS;
760 unsigned long H = U >> (NTL_SP_NBITS-2);
764 unsigned long H = cast_unsigned(b) << 2;
761765 unsigned long Q = MulHiUL(H, ninv);
762766 Q = Q >> NTL_POST_SHIFT;
763 unsigned long L = U;
767 unsigned long L = cast_unsigned(b) << NTL_SP_NBITS;
764768 long r = L - Q*cast_unsigned(n); // r in [0..2*n)
765769
766770
767771 Q += 1L + sp_SignMask(r-n);
768772 return Q; // NOTE: not shifted
769773 }
774
770775
771776 static inline unsigned long
772777 PrepMulModPrecon(long b, long n, sp_inverse ninv)
10041009 #endif
10051010
10061011
1007
1008
1012 #ifdef NTL_HAVE_LL_TYPE
1013
1014 #define NTL_HAVE_SP_LL_ROUTINES
1015
1016
1017 // some routines that are currently not part of the documented
1018 // interface. They currently are only defined when we have appropriate
1019 // LL type.
1020
1021
1022 struct sp_ll_reduce_struct {
1023 unsigned long inv;
1024 long nbits;
1025
1026 sp_ll_reduce_struct() { }
1027
1028 sp_ll_reduce_struct(unsigned long _inv, long _nbits) : inv(_inv), nbits(_nbits) { }
1029
1030 };
1031
1032
1033 static inline sp_ll_reduce_struct
1034 make_sp_ll_reduce_struct(long n)
1035 {
1036 long nbits = NTL_BITS_PER_LONG - sp_CountLeadingZeros(n);
1037 unsigned long inv =
1038 (unsigned long) ( ((((NTL_ULL_TYPE) 1) << (nbits+NTL_BITS_PER_LONG))-1UL) / ((NTL_ULL_TYPE) n) );
1039
1040 return sp_ll_reduce_struct(inv, nbits);
1041 }
1042
1043
1044 // computes remainder (hi, lo) mod d, assumes hi < d
1045 static inline long
1046 sp_ll_red_21(unsigned long hi, unsigned long lo, long d,
1047 sp_ll_reduce_struct dinv)
1048 {
1049 unsigned long H =
1050 (hi << (NTL_BITS_PER_LONG-dinv.nbits)) | (lo >> dinv.nbits);
1051 unsigned long Q = MulHiUL(H, dinv.inv) + H;
1052 unsigned long rr = lo - Q*cast_unsigned(d); // rr in [0..4*d)
1053 long r = sp_CorrectExcess(rr, 2*d); // r in [0..2*d)
1054 r = sp_CorrectExcess(r, d);
1055 return r;
1056 }
1057
1058 // computes remainder (x[n-1], ..., x[0]) mod d
1059 static inline long
1060 sp_ll_red_n1(const unsigned long *x, long n, long d, sp_ll_reduce_struct dinv)
1061 {
1062 long carry = 0;
1063 long i;
1064 for (i = n-1; i >= 0; i--)
1065 carry = sp_ll_red_21(carry, x[i], d, dinv);
1066 return carry;
1067 }
1068
1069 // computes remainder (x2, x1, x0) mod d, assumes x2 < d
1070 static inline long
1071 sp_ll_red_31(unsigned long x2, unsigned long x1, unsigned long x0,
1072 long d, sp_ll_reduce_struct dinv)
1073 {
1074 long carry = sp_ll_red_21(x2, x1, d, dinv);
1075 return sp_ll_red_21(carry, x0, d, dinv);
1076 }
1077
1078
1079 // normalized versions of the above: assume NumBits(d) == NTL_SP_NBITS
1080
1081 // computes remainder (hi, lo) mod d, assumes hi < d
1082 static inline long
1083 sp_ll_red_21_normalized(unsigned long hi, unsigned long lo, long d,
1084 sp_ll_reduce_struct dinv)
1085 {
1086 unsigned long H =
1087 (hi << (NTL_BITS_PER_LONG-NTL_SP_NBITS)) | (lo >> NTL_SP_NBITS);
1088 unsigned long Q = MulHiUL(H, dinv.inv) + H;
1089 unsigned long rr = lo - Q*cast_unsigned(d); // rr in [0..4*d)
1090 long r = sp_CorrectExcess(rr, 2*d); // r in [0..2*d)
1091 r = sp_CorrectExcess(r, d);
1092 return r;
1093 }
1094
1095 // computes remainder (x[n-1], ..., x[0]) mod d
1096 static inline long
1097 sp_ll_red_n1_normalized(const unsigned long *x, long n, long d, sp_ll_reduce_struct dinv)
1098 {
1099 long carry = 0;
1100 long i;
1101 for (i = n-1; i >= 0; i--)
1102 carry = sp_ll_red_21_normalized(carry, x[i], d, dinv);
1103 return carry;
1104 }
1105
1106 // computes remainder (x2, x1, x0) mod d, assumes x2 < d
1107 static inline long
1108 sp_ll_red_31_normalized(unsigned long x2, unsigned long x1, unsigned long x0,
1109 long d, sp_ll_reduce_struct dinv)
1110 {
1111 long carry = sp_ll_red_21_normalized(x2, x1, d, dinv);
1112 return sp_ll_red_21_normalized(carry, x0, d, dinv);
1113 }
1114
1115
1116 #else
1117
1118 // provided to streamline some code
1119
1120
1121 struct sp_ll_reduce_struct { };
1122
1123
1124 static inline sp_ll_reduce_struct
1125 make_sp_ll_reduce_struct(long n)
1126 {
1127 return sp_ll_reduce_struct();
1128 }
1129
1130 #endif
10091131
10101132
10111133 NTL_CLOSE_NNS
196196 #endif
197197
198198
199
200
201199 const NTL_SNS string& CurrentThreadID();
200
201
202202
203203
204204 /*********************************************************************
0
1
2
03
14 #ifndef NTL_tools__H
25 #define NTL_tools__H
1215
1316 #include <cstdlib>
1417 #include <cmath>
18
19
20
21 #if (defined(NTL_THREADS) && defined(__GNUC__) && !defined(NTL_DISABLE_TLS_HACK))
22 #define NTL_TLS_HACK
23 #endif
24
25
26
27 #ifdef NTL_TLS_HACK
28 #include <pthread.h>
29 #endif
1530
1631
1732
124139
125140
126141
127 #define NTL_FILE_THRESH (128000.0)
128 // threshold in KB for switching to external storage of certain
129 // tables (currently in the DDF polynomial factoring routines)
142 #define NTL_FILE_THRESH (1e12)
143 // threshold in KB for switching to external storage of certain tables
130144
131145
132146
191205 inline long min(long a, int b) { return (a < b) ? a : long(b); }
192206 inline long max(long a, int b) { return (a < b) ? long(b) : a; }
193207
194 #endif
195
196
208 inline unsigned int min(unsigned int a, unsigned int b)
209 { return (a < b) ? a : b; }
210 inline unsigned int max(unsigned int a, unsigned int b)
211 { return (a < b) ? b : a; }
212
213 inline unsigned long min(unsigned long a, unsigned long b)
214 { return (a < b) ? a : b; }
215 inline unsigned long max(unsigned long a, unsigned long b)
216 { return (a < b) ? b : a; }
217
218 inline unsigned long min(unsigned int a, unsigned long b)
219 { return (a < b) ? (unsigned long)(a) : b; }
220 inline unsigned long max(unsigned int a, unsigned long b)
221 { return (a < b) ? b : (unsigned long)(a); }
222
223 inline unsigned long min(unsigned long a, unsigned int b)
224 { return (a < b) ? a : (unsigned long)(b); }
225 inline unsigned long max(unsigned long a, unsigned int b)
226 { return (a < b) ? (unsigned long)(b) : a; }
227
228 #endif
229
230
231 // NOTE: these are here for historical reasons, so I'll leave them
232 // Since it is likely to lead to ambiguities with std::swap,
233 // I am not defining any more of these.
197234 inline void swap(long& a, long& b) { long t; t = a; a = b; b = t; }
198235 inline void swap(int& a, int& b) { int t; t = a; a = b; b = t; }
199236
384421 inline void ForceToMem(double *p) { }
385422
386423 #endif
424
425
426 inline double TrueDouble(double x)
427 {
428 ForceToMem(&x);
429 return x;
430 }
431
387432
388433
389434
485530
486531
487532
488 NTL_THREAD_LOCAL extern void (*ErrorCallback)();
489
490 NTL_THREAD_LOCAL extern void (*ErrorMsgCallback)(const char *);
533 extern NTL_CHEAP_THREAD_LOCAL void (*ErrorCallback)();
534
535 extern NTL_CHEAP_THREAD_LOCAL void (*ErrorMsgCallback)(const char *);
491536
492537
493538 void TerminalError(const char *s);
551596
552597 struct scope_guard_builder {
553598 const char *info;
554 scope_guard_builder(const char *_info) : info(_info) { }
599 explicit scope_guard_builder(const char *_info) : info(_info) { }
555600 };
556601
557602 template < typename F >
584629
585630
586631
632
633 #ifdef NTL_TLS_HACK
634
635
636 namespace details_pthread {
637
638
639 template<class T> void do_delete_aux(T* t) noexcept { delete t; }
640 // an exception here would likely lead to a complete mess...
641 // the noexcept specification should force an immediate termination
642
643 template<class T> void do_delete(void* t) { do_delete_aux((T*)t); }
644
645 using namespace std;
646 // I'm not sure if pthread stuff might be placed in namespace std
647
648 struct key_wrapper {
649 pthread_key_t key;
650
651 key_wrapper(void (*destructor)(void*))
652 {
653 if (pthread_key_create(&key, destructor))
654 ResourceError("pthread_key_create failed");
655 }
656
657 template<class T>
658 T* set(T *p)
659 {
660 if (!p) MemoryError();
661 if (pthread_setspecific(key, p)) {
662 do_delete_aux(p);
663 ResourceError("pthread_setspecific failed");
664 }
665 return p;
666 }
667
668 };
669
670 }
671
672
673 #define NTL_TLS_LOCAL_INIT(type, var, init) \
674 static NTL_CHEAP_THREAD_LOCAL type *_ntl_hidden_variable_tls_local_ptr_ ## var = 0; \
675 type *_ntl_hidden_variable_tls_local_ptr1_ ## var = _ntl_hidden_variable_tls_local_ptr_ ## var; \
676 if (!_ntl_hidden_variable_tls_local_ptr1_ ## var) { \
677 static details_pthread::key_wrapper hidden_variable_key(details_pthread::do_delete<type>); \
678 type *_ntl_hidden_variable_tls_local_ptr2_ ## var = hidden_variable_key.set(NTL_NEW_OP type init); \
679 _ntl_hidden_variable_tls_local_ptr1_ ## var = _ntl_hidden_variable_tls_local_ptr2_ ## var; \
680 _ntl_hidden_variable_tls_local_ptr_ ## var = _ntl_hidden_variable_tls_local_ptr1_ ## var; \
681 } \
682 type &var = *_ntl_hidden_variable_tls_local_ptr1_ ## var \
683
684
685
686 #else
687
688
689 // NOTE: this definition of NTL_TLS_LOCAL_INIT ensures that var names
690 // a local reference, regardless of the implementation
691 #define NTL_TLS_LOCAL_INIT(type,var,init) \
692 static NTL_THREAD_LOCAL type _ntl_hidden_variable_tls_local ## var init; \
693 type &var = _ntl_hidden_variable_tls_local ## var
694
695
696
697
698 #endif
699
700 #define NTL_EMPTY_ARG
701 #define NTL_TLS_LOCAL(type,var) NTL_TLS_LOCAL_INIT(type,var,NTL_EMPTY_ARG)
702
703 #define NTL_TLS_GLOBAL_DECL_INIT(type,var,init) \
704 typedef type _ntl_hidden_typedef_tls_access_ ## var; \
705 static inline \
706 type& _ntl_hidden_function_tls_access_ ## var() { \
707 NTL_TLS_LOCAL_INIT(type,var,init); \
708 return var; \
709 } \
710
711
712 #define NTL_TLS_GLOBAL_DECL(type,var) NTL_TLS_GLOBAL_DECL_INIT(type,var,NTL_EMPTY_ARG)
713
714 #define NTL_TLS_GLOBAL_ACCESS(var) \
715 _ntl_hidden_typedef_tls_access_ ## var & var = _ntl_hidden_function_tls_access_ ## var()
716
717
718 // **************************************************************
719 // Following is code for "long long" arithmetic that can
720 // be implemented using NTL_ULL_TYPE or using assembly.
721 // I have found that the assembly can be a bit faster.
722 // For now, this code is only available if NTL_HAVE_LL_TYPE
723 // is defined. This could change. In any case, this provides
724 // a cleaner interface and might eventually allow for
725 // implementation on systems that don't provide a long long type.
726 // **************************************************************
727
728 #ifdef NTL_HAVE_LL_TYPE
729
730
731 #if (!defined(NTL_DISABLE_LL_ASM) \
732 && defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__INTEL_COMPILER) && !defined(__clang__) \
733 && defined (__x86_64__) && NTL_BITS_PER_LONG == 64)
734
735 // NOTE: clang's and icc's inline asm code gen is pretty bad, so
736 // we don't even try.
737
738 // FIXME: probably, this should all be properly tested for speed (and correctness)
739 // using the Wizard.
740
741
742 struct ll_type {
743 unsigned long hi, lo;
744 };
745
746
747 static inline void
748 ll_mul_add(ll_type& x, unsigned long a, unsigned long b)
749 {
750 unsigned long hi, lo;
751 __asm__ (
752 "mulq %[b] \n\t"
753 "addq %[lo],%[xlo] \n\t"
754 "adcq %[hi],%[xhi]" :
755 [lo] "=a" (lo), [hi] "=d" (hi), [xhi] "+r" (x.hi), [xlo] "+r" (x.lo) :
756 [a] "%[lo]" (a), [b] "rm" (b) :
757 "cc"
758 );
759 }
760
// Same as ll_mul_add, but the product is formed with the signed one-operand
// imulq (rdx:rax = rax * b) before being added into x.lo/x.hi with carry.
// NOTE(review): the portable version of this routine states that a and b
// should be representable as positive longs; the same precondition
// presumably applies here -- confirm.
761 static inline void
762 ll_imul_add(ll_type& x, unsigned long a, unsigned long b)
763 {
764 unsigned long hi, lo;
765 __asm__ (
766 "imulq %[b] \n\t"
767 "addq %[lo],%[xlo] \n\t"
768 "adcq %[hi],%[xhi]" :
769 [lo] "=a" (lo), [hi] "=d" (hi), [xhi] "+r" (x.hi), [xlo] "+r" (x.lo) :
770 [a] "%[lo]" (a), [b] "rm" (b) :
771 "cc"
772 );
773 }
774
// x = a*b as a full double-word unsigned product: mulq writes the low word
// to rax, bound directly to x.lo, and the high word to rdx, bound to x.hi.
// Any previous value of x is overwritten.
775 static inline void
776 ll_mul(ll_type& x, unsigned long a, unsigned long b)
777 {
778 __asm__ (
779 "mulq %[b]" :
780 [lo] "=a" (x.lo), [hi] "=d" (x.hi) :
781 [a] "%[lo]" (a), [b] "rm" (b) :
782 "cc"
783 );
784 }
785
// Signed counterpart of ll_mul: one-operand imulq writes the double-word
// signed product of a and b into x.lo (rax) and x.hi (rdx).
// NOTE(review): the portable version states that a and b should be
// representable as positive longs -- presumably required here too.
786 static inline void
787 ll_imul(ll_type& x, unsigned long a, unsigned long b)
788 {
789 __asm__ (
790 "imulq %[b]" :
791 [lo] "=a" (x.lo), [hi] "=d" (x.hi) :
792 [a] "%[lo]" (a), [b] "rm" (b) :
793 "cc"
794 );
795 }
796
// x += a for a single-word a: addq adds a into x.lo, then adcq adds the
// immediate 0 (operand [z], constraint "i") plus the carry into x.hi.
797 static inline void
798 ll_add(ll_type& x, unsigned long a)
799 {
800 __asm__ (
801 "addq %[a],%[xlo] \n\t"
802 "adcq %[z],%[xhi]" :
803 [xhi] "+r" (x.hi), [xlo] "+r" (x.lo) :
804 [a] "rm" (a), [z] "i" (0) :
805 "cc"
806 );
807 }
808
809
810
811 // NOTE: an optimizing compiler will remove the conditional.
812 // The alternative would be to make a specialization for shamt=0.
813 // Unfortunately, this is impossible to do across a wide range
814 // of compilers and still maintain internal linkage --- it is not
815 // allowed to include static spec in the specialization (new compilers
816 // will complain) and without it, some older compilers will generate
817 // an external symbol. In fact, NTL currently never calls
818 // this with shamt=0, so it is all rather academic...but I want to
819 // keep this general for future use.
// Returns the low word of x shifted right by the compile-time amount shamt.
// x is taken by value, so the shrdq (which shifts x.lo right and fills the
// vacated high bits from x.hi) only modifies the local copy.
// When shamt == 0 the asm is skipped and x.lo is returned unchanged.
820 template<long shamt>
821 static inline unsigned long
822 ll_rshift_get_lo(ll_type x)
823 {
824 if (shamt) {
825 __asm__ (
826 "shrdq %[shamt],%[hi],%[lo]" :
827 [lo] "+r" (x.lo) :
828 [shamt] "i" (shamt), [hi] "r" (x.hi) :
829 "cc"
830 );
831 }
832 return x.lo;
833 }
834
835
// Returns the low word of x.
836 static inline unsigned long
837 ll_get_lo(const ll_type& x)
838 {
839 return x.lo;
840 }
841
// Returns the high word of x.
842 static inline unsigned long
843 ll_get_hi(const ll_type& x)
844 {
845 return x.hi;
846 }
847
848
// Sets x to the single-word value a: low word = a, high word = 0.
849 static inline void
850 ll_init(ll_type& x, unsigned long a)
851 {
852 x.lo = a;
853 x.hi = 0;
854 }
855
856 #else
857
858
859 typedef NTL_ULL_TYPE ll_type;
860
861 // NOTE: the following function definitions should serve as
862 // documentation, as well.
863
// x += a*b; both operands are widened to ll_type before multiplying, so the
// product is computed in ll_type rather than in unsigned long.
864 static inline void
865 ll_mul_add(ll_type& x, unsigned long a, unsigned long b)
866 {
867 x += ((ll_type) a)*((ll_type) b);
868 }
869
870 // a and b should be representable as positive long's,
871 // to allow for the most flexible implementation
// x += a*b, where a and b are first converted to long and only then
// widened to ll_type (the signed-operand flavour of ll_mul_add).
872 static inline void
873 ll_imul_add(ll_type& x, unsigned long a, unsigned long b)
874 {
875 x += ((ll_type) long(a))*((ll_type) long(b));
876 }
// x = a*b, with both operands widened to ll_type before multiplying.
// Any previous value of x is overwritten.
877 static inline void
878 ll_mul(ll_type& x, unsigned long a, unsigned long b)
879 {
880 x = ((ll_type) a)*((ll_type) b);
881 }
882
883 // a and b should be representable as positive long's,
884 // to allow for the most flexible implementation
// x = a*b, where a and b are first converted to long and only then
// widened to ll_type (the signed-operand flavour of ll_mul).
885 static inline void
886 ll_imul(ll_type& x, unsigned long a, unsigned long b)
887 {
888 x = ((ll_type) long(a))*((ll_type) long(b));
889 }
890
// x += a for a single-word a.
891 static inline void
892 ll_add(ll_type& x, unsigned long a)
893 {
894 x += a;
895 }
896
// Returns x shifted right by the compile-time amount shamt, converted to
// unsigned long.  x itself is not modified.
897 template<long shamt>
898 static inline unsigned long
899 ll_rshift_get_lo(const ll_type& x)
900 {
901 return ((unsigned long) (x >> shamt));
902 }
903
// Returns x converted to unsigned long (its low word).
904 static inline unsigned long
905 ll_get_lo(const ll_type& x)
906 {
907 return ((unsigned long) x);
908 }
909
// Returns the word of x that lies above its low NTL_BITS_PER_LONG bits.
910 static inline unsigned long
911 ll_get_hi(const ll_type& x)
912 {
913 return ((unsigned long) (x >> NTL_BITS_PER_LONG));
914 }
915
916
// Sets x to the single-word value a.
917 static inline void
918 ll_init(ll_type& x, unsigned long a)
919 {
920 x = a;
921 }
922
923
924 #endif
925
926
927
928 #endif
929
930
931
932
933
587934 NTL_CLOSE_NNS
588935
589936
5858 {
5959 for (long i = 0; i < n; i++)
6060 p[i].~T();
61
62 // NOTE: this routine is only invoked through a Vec destructor
63 // or a scope guard destructor, both of which are noexcept destructors.
64 // therefore, if ~T() should throw, the program will terminate
6165 }
6266
6367
11 #ifndef NTL_version__H
22 #define NTL_version__H
33
4 #define NTL_VERSION "9.3.0"
4 #define NTL_VERSION "9.9.0"
55
66 #define NTL_MAJOR_VERSION (9)
7 #define NTL_MINOR_VERSION (3)
7 #define NTL_MINOR_VERSION (9)
88 #define NTL_REVISION (0)
99
1010 #endif
4141
4242 void normalize();
4343
44 NTL_THREAD_LOCAL static long oprec;
44 static
45 NTL_CHEAP_THREAD_LOCAL
46 long oprec;
4547
4648 static void SetOutputPrecision(long p);
4749 static long OutputPrecision() { return oprec; }
0
1 #include <NTL/BasicThreadPool.h>
2
3 #ifdef NTL_THREAD_BOOST
4
5 NTL_START_IMPL
6
7
8 NTL_TLS_GLOBAL_DECL(UniquePtr<BasicThreadPool>, NTLThreadPool_stg)
9
10 NTL_CHEAP_THREAD_LOCAL BasicThreadPool *NTLThreadPool_ptr = 0;
11
// Installs `pool` as the current pool: hands it to NTLThreadPool_stg via
// reset() and caches the same raw pointer in NTLThreadPool_ptr.
// NOTE(review): NTLThreadPool_stg is a UniquePtr<BasicThreadPool>, so
// reset() presumably takes ownership of `pool` and destroys the previously
// held pool -- confirm against UniquePtr.
12 void ResetThreadPool(BasicThreadPool *pool)
13 {
14 NTL_TLS_GLOBAL_ACCESS(NTLThreadPool_stg);
15 NTLThreadPool_stg.reset(pool);
16 NTLThreadPool_ptr = pool;
17 }
18
// Detaches the current pool: takes the pointer out of NTLThreadPool_stg via
// release(), zeroes the cached NTLThreadPool_ptr, and returns the pointer
// to the caller.
// NOTE(review): after release() the caller is presumably responsible for
// the returned pool's lifetime -- confirm against UniquePtr.
19 BasicThreadPool *ReleaseThreadPool()
20 {
21 NTL_TLS_GLOBAL_ACCESS(NTLThreadPool_stg);
22 BasicThreadPool *pool = NTLThreadPool_stg.release();
23 NTLThreadPool_ptr = 0;
24 return pool;
25 }
26
27
28
29 NTL_END_IMPL
30
31 #endif
7676 random(a, n, m);
7777
7878 t = GetTime();
79 kernel(x, a);
79 image(x, a);
8080 t = GetTime() - t; cerr << t << "\n";
8181
8282 cvt(A, a);
8383
8484 t = GetTime();
85 kernel(X, A);
85 image(X, A);
8686 t = GetTime() - t; cerr << t << "\n";
87
88 cerr << x.NumRows() << "\n";
8987
9088 cvt(X1, x);
9189
9290 if (X1 != X) TerminalError("BitMatTest NOT OK!!");
93
94 if (!IsZero(X*A)) TerminalError("BitMatTest NOT OK!!");
9591
9692 cerr << "\n";
9793 }
0 #include <NTL/ctools.h>
1
2 #include <cstdlib>
3 #include <immintrin.h>
4 #include <iostream>
5
6
7 #if (!defined(__GNUC__) || !defined(__x86_64__) || !defined(__AVX__))
8 #error "AVX not supported"
9 #endif
10
11 #if (NTL_BITS_PER_LONG != 64 || NTL_DOUBLE_PRECISION != 53)
12 #error "AVX not supported"
13 // sanity check -- code that uses this feature also relies on this
14 #endif
15
16 using namespace std;
17
// Computes x[i] = a[i]*b[i] + x[i] for four doubles using a 256-bit AVX
// multiply followed by an add.  The aligned load/store intrinsics are used,
// which expect 32-byte-aligned pointers.
// NOTE(review): cvec is declared but never used.
18 void fun(double * x, const double *a, const double *b)
19 {
20 __m256d xvec, avec, bvec, cvec;
21
22 avec = _mm256_load_pd(a);
23 bvec = _mm256_load_pd(b);
24 xvec = _mm256_load_pd(x);
25
26 xvec = _mm256_add_pd(_mm256_mul_pd(avec, bvec), xvec);
27
28 _mm256_store_pd(x, xvec);
29 }
// AVX probe: fills a = {1,2,3,4}, b = {2,3,4,5}, x = {3,4,5,6} in three
// consecutive 4-double slices of one NTL_AVX_LOCAL_ARRAY buffer, runs fun(),
// and returns 0 only if x == {5,10,17,26} (i.e. a[i]*b[i] + x[i]); otherwise
// returns -1.
// NOTE(review): the values go through atoi() presumably so the compiler
// cannot fold the whole computation at compile time -- confirm.
30 int main()
31 {
32 NTL_AVX_LOCAL_ARRAY(vp, double, 12);
33
34 double *a = vp + 0*4;
35 double *b = vp + 1*4;
36 double *x = vp + 2*4;
37
38 a[0] = atoi("1");
39 a[1] = atoi("2");
40 a[2] = atoi("3");
41 a[3] = atoi("4");
42
43 b[0] = atoi("2");
44 b[1] = atoi("3");
45 b[2] = atoi("4");
46 b[3] = atoi("5");
47
48 x[0] = atoi("3");
49 x[1] = atoi("4");
50 x[2] = atoi("5");
51 x[3] = atoi("6");
52
53 fun(x, a, b);
54
55 if (x[0] == 5 && x[1] == 10 && x[2] == 17 && x[3] == 26)
56 return 0;
57 else
58 return -1;
59 }
60
61
62
0
// Trivial program that always exits with status 0.
// NOTE(review): presumably the probe built by the configure script to test
// whether a candidate compiler flag is accepted -- confirm.
1 int main() { return 0; }
0
1 #include <NTL/ctools.h>
2
3 #include <cstdlib>
4 #include <immintrin.h>
5 #include <iostream>
6
7
8 #if (!defined(__GNUC__) || !defined(__x86_64__) || !defined(__AVX2__))
9 #error "AVX2 with FMA not supported"
10 #endif
11
12 #if (NTL_BITS_PER_LONG != 64 || NTL_DOUBLE_PRECISION != 53)
13 #error "AVX2 with FMA not supported"
14 // sanity check -- code that uses this feature also relies on this
15 #endif
16
17 using namespace std;
18
19
// Computes x[i] = a[i]*b[i] + x[i] for four doubles with a single fused
// multiply-add intrinsic (_mm256_fmadd_pd).  The aligned load/store
// intrinsics are used, which expect 32-byte-aligned pointers.
// NOTE(review): cvec is declared but never used.
20 void fun(double * x, const double *a, const double *b)
21 {
22 __m256d xvec, avec, bvec, cvec;
23
24 avec = _mm256_load_pd(a);
25 bvec = _mm256_load_pd(b);
26 xvec = _mm256_load_pd(x);
27
28 xvec = _mm256_fmadd_pd(avec, bvec, xvec);
29
30 _mm256_store_pd(x, xvec);
31 }
// FMA probe: fills a = {1,2,3,4}, b = {2,3,4,5}, x = {3,4,5,6} in three
// consecutive 4-double slices of one NTL_AVX_LOCAL_ARRAY buffer, runs fun(),
// and returns 0 only if x == {5,10,17,26} (i.e. a[i]*b[i] + x[i]); otherwise
// returns -1.
// NOTE(review): the values go through atoi() presumably so the compiler
// cannot fold the whole computation at compile time -- confirm.
32 int main()
33 {
34 NTL_AVX_LOCAL_ARRAY(vp, double, 12);
35
36 double *a = vp + 0*4;
37 double *b = vp + 1*4;
38 double *x = vp + 2*4;
39
40 a[0] = atoi("1");
41 a[1] = atoi("2");
42 a[2] = atoi("3");
43 a[3] = atoi("4");
44
45 b[0] = atoi("2");
46 b[1] = atoi("3");
47 b[2] = atoi("4");
48 b[3] = atoi("5");
49
50 x[0] = atoi("3");
51 x[1] = atoi("4");
52 x[2] = atoi("5");
53 x[3] = atoi("6");
54
55 fun(x, a, b);
56
57 if (x[0] == 5 && x[1] == 10 && x[2] == 17 && x[3] == 26)
58 return 0;
59 else
60 return -1;
61 }
62
63
64
0
1
2 #define NTL_HAVE_LL_TYPE
3 // DIRT: we need to define this here so that ctools.h
4 // does not undefine the LL type macros
5
06 #include <NTL/ctools.h>
7
8 #ifdef NTL_DISABLE_LONGLONG
9 #error "LL_TYPE disabled"
10 #endif
111
212 void touch(unsigned long& x);
313 void touch(long& x);
+0
-54
src/CheckPCLMUL.c less more
0 #include <iostream>
1 #include <wmmintrin.h>
2
3
4 using namespace std;
5
6
7
// Carry-less multiplication of the two 64-bit words a and b via the PCLMUL
// intrinsic: each word is placed in the low half of a 128-bit vector (high
// half zero), the low halves are multiplied (selector 0), and the 128-bit
// result is stored unaligned to c[0..1].
8 void
9 pclmul_mul1 (unsigned long *c, unsigned long a, unsigned long b)
10 {
11 __m128i aa = _mm_setr_epi64( _mm_cvtsi64_m64(a), _mm_cvtsi64_m64(0));
12 __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0));
13 _mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0));
14 }
15
16
// PCLMUL probe.  First counts the bits in unsigned long by shifting a 1 out
// of the word and bails out (exit status 1) unless the count is 64.  Then
// checks that the carry-less product of 3 and 3 is {5, 0}; prints "good"
// and returns 0 on success, prints "bad" and returns 1 otherwise.
17 int main()
18 {
19 cout << "Running CheckPCLMUL...";
20
21 // make sure longs are 64 bit
22 // this runs before mach_desc.h is built, so we calculate
23 // bits-per-long here...in not quite as paranoid a fashion
24 // as in MakeDesc.c. On any standard-compliant compiler,
25 // it should be correct.
26
27 unsigned long ulval = 1;
28 long bpl = 0;
29
30 while (ulval) {
31 ulval <<= 1;
32 bpl++;
33 }
34
35 if (bpl != 64) {
36 cout << "bad (only works with 64-bit longs)\n";
37 return 1;
38 }
39
40 unsigned long c[2], a, b;
41 a = 3;
42 b = 3;
43 pclmul_mul1(c, a, b);
44 if (c[0] == 5 && c[1] == 0) {
45 cout << "good\n";
46 return 0;
47 }
48 else {
49 cout << "bad\n";
50 return 1;
51 }
52 }
53
0
1 cp "$1/include/NTL/HAVE_LL_TYPE.h" "$2/include/NTL/HAVE_LL_TYPE.h"
2 cp "$1/include/NTL/HAVE_BUILTIN_CLZL.h" "$2/include/NTL/HAVE_BUILTIN_CLZL.h"
3 cp "$1/include/NTL/HAVE_AVX.h" "$2/include/NTL/HAVE_AVX.h"
4 cp "$1/include/NTL/HAVE_FMA.h" "$2/include/NTL/HAVE_FMA.h"
0 ntl-9.3.0
0 ntl-9.9.0
1212 {
1313
1414 cout << "\n\n";
15 cout << "/***************************\n";
1516 cout << "Basic Configuration Options:\n";
1617
1718
2930 cout << "NTL_THREADS\n";
3031 #endif
3132
33 #ifdef NTL_DISABLE_TLS_HACK
34 cout << "NTL_DISABLE_TLS_HACK\n";
35 #endif
36
37 #ifdef NTL_ENABLE_TLS_HACK
38 cout << "NTL_ENABLE_TLS_HACK\n";
39 #endif
40
3241 #ifdef NTL_EXCEPTIONS
3342 cout << "NTL_EXCEPTIONS\n";
43 #endif
44
45 #ifdef NTL_THREAD_BOOST
46 cout << "NTL_THREAD_BOOST\n";
3447 #endif
3548
3649
4154
4255 #ifdef NTL_GF2X_LIB
4356 cout << "NTL_GF2X_LIB\n";
44 #endif
45
46 #ifdef NTL_PCLMUL
47 cout << "NTL_PCLMUL\n";
4857 #endif
4958
5059 #ifdef NTL_LONG_LONG_TYPE
96105 cout << "NTL_DISABLE_LONGLONG\n";
97106 #endif
98107
108 #ifdef NTL_DISABLE_LL_ASM
109 cout << "NTL_DISABLE_LL_ASM\n";
110 #endif
111
112 #ifdef NTL_MAXIMIZE_SP_NBITS
113 cout << "NTL_MAXIMIZE_SP_NBITS\n";
114 #endif
99115
100116
101117 cout << "\n";
147163 cout << "NTL_TBL_REM_LL\n";
148164 #endif
149165
166 #ifdef NTL_CRT_ALTCODE
167 cout << "NTL_CRT_ALTCODE\n";
168 #endif
169
170 #ifdef NTL_CRT_ALTCODE_SMALL
171 cout << "NTL_CRT_ALTCODE_SMALL\n";
172 #endif
173
150174
151175 #ifdef NTL_GF2X_ALTCODE
152176 cout << "NTL_GF2X_ALTCODE\n";
162186 cout << "NTL_GF2X_NOINLINE\n";
163187 #endif
164188
189 #ifdef NTL_PCLMUL
190 cout << "NTL_PCLMUL\n";
191 #endif
192
193
194 cout << "***************************/\n";
165195 cout << "\n\n";
166196
167197 return 0;
22 # use warnings; # this doesn't work on older versions of perl
33
44
5 sub RemoveProg {
6
7 # This should work on unix and cygwin on windows
8
9 my ($name) = @_;
10
11 unlink($name); unlink("$name.exe");
12
13 }
14
15 sub BadPCLMUL {
16 print "*\n*\nPCLMUL does not work on this system\n";
17 print "reconfiguring with NTL_PCLMUL=off...\n*\n*\n";
18 system("echo 'NTL_PCLMUL=off' > RETRY_CONFIG");
19 exit 1;
20 }
215
226 %MakeFlag = (
237
248 'WIZARD' => 'on',
259 'SHARED' => 'off',
10 'NATIVE' => 'on'
2611
2712 );
2813
3015
3116 'CXX' => 'g++',
3217 'CXXFLAGS' => '-g -O2',
18 'CXXAUTOFLAGS'=> '',
3319 'AR' => 'ar',
3420 'ARFLAGS' => 'ruv',
3521 'RANLIB' => 'ranlib',
6147
6248 %ConfigFlag = (
6349
64 'NTL_LEGACY_NO_NAMESPACE' => 'off',
65 'NTL_LEGACY_INPUT_ERROR' => 'off',
50 'NTL_LEGACY_NO_NAMESPACE' => 'off',
51 'NTL_LEGACY_INPUT_ERROR' => 'off',
6652 'NTL_DISABLE_LONGDOUBLE' => 'off',
67 'NTL_DISABLE_LONGLONG' => 'off',
68 'NTL_LEGACY_SP_MULMOD' => 'off',
69 'NTL_THREADS' => 'off',
70 'NTL_EXCEPTIONS' => 'off',
71 'NTL_GMP_LIP' => 'off',
72 'NTL_GF2X_LIB' => 'off',
73 'NTL_PCLMUL' => 'off',
74 'NTL_X86_FIX' => 'off',
75 'NTL_NO_X86_FIX' => 'off',
76 'NTL_AVOID_FLOAT' => 'off',
77 'NTL_LONG_LONG' => 'off',
78 'NTL_SPMM_ULL' => 'off',
79 'NTL_SPMM_ASM' => 'off',
80 'NTL_AVOID_BRANCHING' => 'off',
81 'NTL_TBL_REM' => 'off',
82 'NTL_TBL_REM_LL' => 'off',
83 'NTL_GF2X_NOINLINE' => 'off',
84 'NTL_GF2X_ALTCODE' => 'off',
85 'NTL_GF2X_ALTCODE1' => 'off',
86 'NTL_NO_INIT_TRANS' => 'off',
87 'NTL_CLEAN_INT' => 'off',
88 'NTL_CLEAN_PTR' => 'off',
89 'NTL_RANGE_CHECK' => 'off',
90 'NTL_FFT_BIGTAB' => 'off',
91 'NTL_FFT_LAZYMUL' => 'off',
53 'NTL_DISABLE_LONGLONG' => 'off',
54 'NTL_DISABLE_LL_ASM' => 'off',
55 'NTL_MAXIMIZE_SP_NBITS' => 'off',
56 'NTL_LEGACY_SP_MULMOD' => 'off',
57 'NTL_THREADS' => 'off',
58 'NTL_DISABLE_TLS_HACK' => 'off',
59 'NTL_ENABLE_TLS_HACK' => 'off',
60 'NTL_EXCEPTIONS' => 'off',
61 'NTL_THREAD_BOOST' => 'off',
62 'NTL_GMP_LIP' => 'on',
63 'NTL_GF2X_LIB' => 'off',
64 'NTL_X86_FIX' => 'off',
65 'NTL_NO_X86_FIX' => 'off',
66 'NTL_AVOID_FLOAT' => 'off',
67 'NTL_LONG_LONG' => 'off',
68 'NTL_SPMM_ULL' => 'off',
69 'NTL_SPMM_ASM' => 'off',
70 'NTL_AVOID_BRANCHING' => 'off',
71 'NTL_TBL_REM' => 'off',
72 'NTL_TBL_REM_LL' => 'off',
73 'NTL_CRT_ALTCODE' => 'off',
74 'NTL_CRT_ALTCODE_SMALL' => 'off',
75 'NTL_GF2X_NOINLINE' => 'off',
76 'NTL_GF2X_ALTCODE' => 'off',
77 'NTL_GF2X_ALTCODE1' => 'off',
78 'NTL_PCLMUL' => 'off',
79 'NTL_NO_INIT_TRANS' => 'off',
80 'NTL_CLEAN_INT' => 'off',
81 'NTL_CLEAN_PTR' => 'off',
82 'NTL_RANGE_CHECK' => 'off',
83 'NTL_FFT_BIGTAB' => 'off',
84 'NTL_FFT_LAZYMUL' => 'off',
9285
9386 );
9487
109102
110103 if ($arg =~ '^(-h|help|-help|--help)$') {
111104 system("more ../doc/config.txt");
112 exit;
105 exit 0;
113106 }
114107
115108 if ($arg =~ '^--nowrite$') {
144137
145138 }
146139
147 # special processing for NTL_THREADS: if this is set, we override
148 # the default setting for CXXFLAGS
149
150 if ($ConfigFlag{'NTL_THREADS'} eq 'on' && !exists($Variable{'CXXFLAGS'})) {
151 $MakeVal{'CXXFLAGS'} = $MakeVal{'CXXFLAGS'} . ' -std=c++11 -pthread';
152 }
153
154 # special processing for NTL_EXCEPTIONS: similar to processing
155 # for NTL_THREADS
156
157 if ($ConfigFlag{'NTL_EXCEPTIONS'} eq 'on' && $ConfigFlag{'NTL_THREADS'} eq 'off' && !exists($Variable{'CXXFLAGS'})) {
158 $MakeVal{'CXXFLAGS'} = $MakeVal{'CXXFLAGS'} . ' -std=c++11';
159 }
160
161
162 # special processing for NTL_PCLMUL: if set, add -mpclmul to CXXFLAGS
163
164 if ($ConfigFlag{'NTL_PCLMUL'} eq 'on' && !exists($Variable{'CXXFLAGS'})) {
165 $MakeVal{'CXXFLAGS'} = $MakeVal{'CXXFLAGS'} . ' -mpclmul';
166 }
140 # special processing: NTL_THREAD_BOOST => NTL_THREADS
141
142 if ($ConfigFlag{'NTL_THREAD_BOOST'} eq 'on') {
143 $ConfigFlag{'NTL_THREADS'} = 'on';
144 }
145
146 if ($ConfigFlag{'NTL_THREADS'} eq 'on' && $ConfigFlag{'NTL_GMP_LIP'} eq 'off') {
147 die "Error: NTL_THREADS currently only available with NTL_GMP_LIP...sorry\n";
148 }
149
167150
168151
169152 # some special MakeVal values that are determined by SHARED
194177 $MakeVal{'GMPL'} = '# ';
195178 $MakeVal{'GMP'} = '# ';
196179
197 if ($ConfigFlag{'NTL_GMP_LIP'} eq 'on' || $ConfigFlag{'NTL_GMP_HACK'} eq 'on') {
180 if ($ConfigFlag{'NTL_GMP_LIP'} eq 'on') {
198181 $MakeVal{'GMP'} = '';
199182 if (exists($Variable{'DEF_PREFIX'}) ||
200183 exists($Variable{'GMP_PREFIX'}) ||
296279
297280 }
298281
299 if ($ConfigSub{'NTL_GMP_HACK'} + $ConfigSub{'NTL_GMP_LIP'} > 1) {
300
301 die "Error: at most one of NTL_GMP_HACK and NTL_GMP_LIP may be on\n";
302
303 }
304282
305283 if ($ConfigSub{'NTL_AVOID_FLOAT'} + $ConfigSub{'NTL_LONG_LONG'} > 1) {
306284
324302
325303
326304
327 ######################################
328
329 # all tests pass -- generate files
330
331 ######################################
332
333 # generate makefile
334
335 open(MFILE, "< mfile");
336 open(MFILEOUT, "> mfileout");
337
338 while ($line = <MFILE>) {
339
340 $line =~ s/@\{(.*?)\}/$MakeSub{$1}/ge;
341
342 print MFILEOUT $line;
343
344 }
345
346 close(MFILE);
347 close(MFILEOUT);
348
349
350 # generate config.h
351
352 open(CFILE, "< cfile");
353 open(CFILEOUT, "> cfileout");
354
355 while ($line = <CFILE>) {
356
357 $line =~ s/@\{(.*?)\}/$ConfigSub{$1}/ge;
358
359 print CFILEOUT $line;
360
361 }
362
363 close(CFILE);
364 close(CFILEOUT);
365
366 print("CXXFLAGS=\"$MakeVal{'CXXFLAGS'}\"\n");
367
368 if ($nowrite == 0) {
369
370 print("writing makefile\n");
305
306 #
307 #
308 #code to set CXXAUTOFLAGS
309
# Deletes the program $name and its "$name.exe" variant; failures of unlink
# (e.g. file not present) are ignored.  Always returns 1, which lets callers
# chain it with `and`.
310 sub RemoveProg {
311 # This should work on unix and cygwin on windows
312
313 my ($name) = @_;
314 unlink($name); unlink("$name.exe");
315 return 1;
316 }
317
# GenFiles: expand the templates into staging files in the current directory.
#   mfile -> mfileout : every @{NAME} is replaced by $MakeSub{NAME}
#   cfile -> cfileout : every @{NAME} is replaced by $ConfigSub{NAME}
#   hfileout          : two comment lines recording the configure arguments
#                       and the current value of $MakeSub{'CXXAUTOFLAGS'}
# Dies with a message if any of these files cannot be opened, so that a
# missing template or an unwritable directory is reported instead of
# silently producing empty output files.  Returns 1.
sub GenFiles {

   # generate makefile

   open(MFILE, "< mfile") or die "Error: cannot open mfile for reading: $!\n";
   open(MFILEOUT, "> mfileout") or die "Error: cannot open mfileout for writing: $!\n";

   while ($line = <MFILE>) {

      $line =~ s/@\{(.*?)\}/$MakeSub{$1}/ge;

      print MFILEOUT $line;

   }

   close(MFILE);
   close(MFILEOUT);


   # generate config.h

   open(CFILE, "< cfile") or die "Error: cannot open cfile for reading: $!\n";
   open(CFILEOUT, "> cfileout") or die "Error: cannot open cfileout for writing: $!\n";

   while ($line = <CFILE>) {

      $line =~ s/@\{(.*?)\}/$ConfigSub{$1}/ge;

      print CFILEOUT $line;

   }

   close(CFILE);
   close(CFILEOUT);


   # generate the configuration log header

   open(HFILEOUT, "> hfileout") or die "Error: cannot open hfileout for writing: $!\n";
   $argstr = join(' ', @ARGV);
   print HFILEOUT "// generated by ./configure $argstr\n";
   print HFILEOUT "// CXXAUTOFLAGS=\"$MakeSub{'CXXAUTOFLAGS'}\" \n";
   close(HFILEOUT);


   return 1;
}
361
362 sub CopyFiles {
363
371364 system("cp mfileout makefile");
372
373 print("writing ../include/NTL/config.h\n");
374365 system("cp cfileout ../include/NTL/config.h");
375
376 if ($ConfigFlag{'NTL_PCLMUL'} eq 'on') {
377 print "*\n*\nNTL_PCLMUL=on => checking system compatibility...\n\n";
378 RemoveProg("CheckPCLMUL");
379 system("make CheckPCLMUL") and BadPCLMUL();
380 system("./CheckPCLMUL") and BadPCLMUL();
381 print "*\n*\nPCLMUL works on this system\n";
382 }
383
384 }
385
366 system("cp hfileout ../include/NTL/config_log.h");
367
368 return 1;
369 }
370
# Regenerates and installs the configuration files (GenFiles, CopyFiles),
# then builds the CheckCompile program with make (output appended to
# CheckFlag.log) and runs it.  Returns 1 if both the build and the run
# succeed, 0 otherwise.  Any stale binary is removed before the build and
# the binary is removed again after it has been run.
# system() yields a true value on failure, hence the `... and return 0`
# idiom; RemoveProg returns 1, which lets the chain on the run line reach
# `return 0`.
371 sub CheckCompile {
372 GenFiles();
373 CopyFiles();
374 RemoveProg("CheckCompile");
375 system("make CheckCompile >> CheckFlag.log 2>&1") and return 0;
376 system("./CheckCompile") and RemoveProg("CheckCompile") and return 0;
377 RemoveProg("CheckCompile");
378 return 1;
379 }
380
# Tries to add the compiler flag $flag to $MakeSub{'CXXAUTOFLAGS'}.
# The flag is appended tentatively and CheckCompile() is run with it; if the
# probe fails, CXXAUTOFLAGS is restored to its previous value.  Progress is
# reported on stdout.  Always returns 1, whether or not the flag was kept.
381 sub CheckFlag {
382 my ($flag) = @_;
383 my $try_flags = $MakeSub{'CXXAUTOFLAGS'};
384 print "*** checking $flag flag\n";
385 $MakeSub{'CXXAUTOFLAGS'} = $MakeSub{'CXXAUTOFLAGS'} . ' ' . $flag;
386 print("CXXAUTOFLAGS=\"$MakeSub{'CXXAUTOFLAGS'}\"\n");
387 if (CheckCompile()) {
388 print "*** $flag works\n";
389 }
390 else {
391 $MakeSub{'CXXAUTOFLAGS'} = $try_flags;
392 print "*** $flag does not work\n";
393 }
394 return 1;
395 }
396
397
398
399 if ($nowrite) {
400 GenFiles();
401 exit 0;
402 }
403
404
405 if (exists($Variable{'CXXAUTOFLAGS'})) {
406 print("CXXAUTOFLAGS=\"$MakeSub{'CXXAUTOFLAGS'}\"\n");
407 GenFiles();
408 CopyFiles();
409 exit 0;
410 }
411
412 $std_flag = 0;
413 $pthread_flag = 0;
414 $native_flag = 0;
415
416 # special processing for NTL_THREADS
417
418 if ($ConfigFlag{'NTL_THREADS'} eq 'on') {
419 $std_flag = 1; # ' -std=c++11';
420 $pthread_flag = 1; # ' -pthread';
421 }
422
423 # special processing for NTL_EXCEPTIONS
424
425 if ($ConfigFlag{'NTL_EXCEPTIONS'} eq 'on') {
426 $std_flag = 1; # ' -std=c++11';
427 }
428
429 # special processing for NATIVE
430
431 if ($MakeFlag{'NATIVE'} eq 'on') {
432 $native_flag = 1; # ' -march=native';
433 }
434
435
436 system("echo '*** CheckFlag log ***' > CheckFlag.log");
437
438 if ($std_flag) {
439 CheckFlag('-std=c++11');
440 }
441
442 if ($pthread_flag) {
443 CheckFlag('-pthread');
444 }
445
446 if ($native_flag) {
447 CheckFlag('-march=native');
448 }
449
450 print("CXXAUTOFLAGS=\"$MakeSub{'CXXAUTOFLAGS'}\"\n");
451 print("generating makefile\n");
452 print("generating ../include/NTL/config.h\n");
453 print("generating ../include/NTL/config_log.h\n");
454
455 GenFiles();
456 CopyFiles();
457 exit 0;
458
459
460
249249
250250
251251
252 // #define NTL_BRC_TEST
253 // Flag to test the cost of "bit reverse copy"
252254
253255
254256 #define NTL_FFT_BIGTAB_LIMIT (200)
257 #ifndef NTL_BRC_TEST
255258 #define NTL_FFT_BIGTAB_MAXROOT (17)
259 #else
260 #define NTL_FFT_BIGTAB_MAXROOT (25)
261 #endif
256262 // big tables are only used for the first NTL_FFT_BIGTAB_LIMIT primes,
257263 // and then only for k-values at most NTL_FFT_BIGTAB_MAXROOT
258264
538544 }
539545
540546
541
542 NTL_THREAD_LOCAL static
543 Vec<long> brc_mem[NTL_FFTMaxRoot+1];
544547 // FIXME: This could potentially be shared across threads, using
545548 // a "lazy table".
// Returns a pointer to the first element of brc_mem_vec, a Vec< Vec<long> >
// constructed with (INIT_SIZE, NTL_FFTMaxRoot+1).  Callers index the result
// by k, e.g. brc_mem[k].SetLength(n) in BRC_init.
// NOTE(review): NTL_TLS_LOCAL_INIT presumably makes brc_mem_vec a
// per-thread object that is initialized once -- confirm against the macro.
549 static inline
550 Vec<long> *get_brc_mem()
551 {
552 NTL_TLS_LOCAL_INIT(Vec< Vec<long> >, brc_mem_vec, (INIT_SIZE, NTL_FFTMaxRoot+1));
553 return brc_mem_vec.elts();
554 }
555
546556
547557
548558 #if 0
551561 static
552562 void BitReverseCopy(long * NTL_RESTRICT A, const long * NTL_RESTRICT a, long k)
553563 {
564 Vec<long> *brc_mem = get_brc_mem();
565
554566 long n = 1L << k;
555567 long* NTL_RESTRICT rev;
556568 long i, j;
571583 static
572584 void BitReverseCopy(unsigned long * NTL_RESTRICT A, const long * NTL_RESTRICT a, long k)
573585 {
586 Vec<long> *brc_mem = get_brc_mem();
587
574588 long n = 1L << k;
575589 long* NTL_RESTRICT rev;
576590 long i, j;
602616 static
603617 long *BRC_init(long k)
604618 {
619 Vec<long> *brc_mem = get_brc_mem();
620
605621 long n = (1L << k);
606622 brc_mem[k].SetLength(n);
607623 long *rev = brc_mem[k].elts();
616632 void BasicBitReverseCopy(long * NTL_RESTRICT B,
617633 const long * NTL_RESTRICT A, long k)
618634 {
635 Vec<long> *brc_mem = get_brc_mem();
636
619637 long n = 1L << k;
620638 long* NTL_RESTRICT rev;
621639 long i, j;
632650 static
633651 void COBRA(long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
634652 {
635 NTL_THREAD_LOCAL static Vec<long> BRC_temp;
653 Vec<long> *brc_mem = get_brc_mem();
654
655 NTL_TLS_LOCAL(Vec<long>, BRC_temp);
636656
637657 long q = NTL_BRC_Q;
638658 long k1 = k - 2*q;
683703 void BasicBitReverseCopy(unsigned long * NTL_RESTRICT B,
684704 const long * NTL_RESTRICT A, long k)
685705 {
706 Vec<long> *brc_mem = get_brc_mem();
707
686708 long n = 1L << k;
687709 long* NTL_RESTRICT rev;
688710 long i, j;
699721 static
700722 void COBRA(unsigned long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
701723 {
702 NTL_THREAD_LOCAL static Vec<unsigned long> BRC_temp;
724 Vec<long> *brc_mem = get_brc_mem();
725
726 NTL_TLS_LOCAL(Vec<unsigned long>, BRC_temp);
703727
704728 long q = NTL_BRC_Q;
705729 long k1 = k - 2*q;
807831
808832 // assume k > 1
809833
810 NTL_THREAD_LOCAL static Vec<long> wtab_store;
811 NTL_THREAD_LOCAL static Vec<mulmod_precon_t> wqinvtab_store;
812 NTL_THREAD_LOCAL static Vec<long> AA_store;
834 NTL_TLS_LOCAL(Vec<long>, wtab_store);
835 NTL_TLS_LOCAL(Vec<mulmod_precon_t>, wqinvtab_store);
836 NTL_TLS_LOCAL(Vec<long>, AA_store);
813837
814838 wtab_store.SetLength(1L << (k-2));
815839 wqinvtab_store.SetLength(1L << (k-2));
10411065 static inline unsigned long
10421066 sp_NormalizedLazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, unsigned long ninv)
10431067 {
1044 NTL_ULL_TYPE U = ((NTL_ULL_TYPE) cast_unsigned(b)) << (NTL_SP_NBITS+2);
1045 unsigned long H = ((U << (NTL_BITS_PER_LONG-NTL_SP_NBITS-2)) >> NTL_BITS_PER_LONG);
1068 unsigned long H = cast_unsigned(b);
10461069 unsigned long Q = MulHiUL(H << 4, ninv);
1047 unsigned long L = U;
1070 unsigned long L = cast_unsigned(b) << (NTL_SP_NBITS+2);
10481071 long r = L - Q*cast_unsigned(n); // r in [0..2*n)
10491072
10501073 r = sp_CorrectExcessQuo(Q, r, n);
10551078 static inline unsigned long
10561079 sp_NormalizedLazyPrepMulModPrecon(long b, long n, unsigned long ninv)
10571080 {
1058 NTL_ULL_TYPE U = ((NTL_ULL_TYPE) cast_unsigned(b)) << (NTL_SP_NBITS+2);
1059 unsigned long H = ((U << (NTL_BITS_PER_LONG-NTL_SP_NBITS-2)) >> NTL_BITS_PER_LONG);
1081 unsigned long H = cast_unsigned(b);
10601082 unsigned long Q = MulHiUL(H << 4, ninv);
1061 unsigned long L = U;
1083 unsigned long L = cast_unsigned(b) << (NTL_SP_NBITS+2);
10621084 long r = L - Q*cast_unsigned(n); // r in [0..2*n)
10631085
10641086 Q += 1L + sp_SignMask(r-n);
10651087 return Q; // NOTE: not shifted
10661088 }
1089
10671090
10681091 #else
10691092
12571280
12581281
12591282
1283 // FFT: Lazy, no tables
12601284
12611285 void NTL_FFT_ROUTINE_NOTAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
12621286
12831307
12841308 // assume k >= 2
12851309
1286 NTL_THREAD_LOCAL static Vec<unsigned long> AA_store;
1310 NTL_TLS_LOCAL(Vec<unsigned long>, AA_store);
12871311 AA_store.SetLength(1L << k);
12881312 unsigned long *AA = AA_store.elts();
12891313
1290 NTL_THREAD_LOCAL static Vec<long> wtab_store;
1314 NTL_TLS_LOCAL(Vec<long>, wtab_store);
12911315 wtab_store.SetLength(max(2, 1L << (k-2)));
12921316 // allocate space for at least 2 elements, to deal with a corner case when k == 2
12931317 long * NTL_RESTRICT wtab = wtab_store.elts();
12941318
1295 NTL_THREAD_LOCAL static Vec<mulmod_precon_t> wqinvtab_store;
1319 NTL_TLS_LOCAL(Vec<mulmod_precon_t>, wqinvtab_store);
12961320 wqinvtab_store.SetLength(max(2, 1L << (k-2)));
12971321 // allocate space for at least 2 elements, to deal with a corner case when k == 2
12981322 mulmod_precon_t * NTL_RESTRICT wqinvtab = wqinvtab_store.elts();
16831707 }
16841708
16851709
1710 // FFT: no lazy, table
16861711
16871712 void NTL_FFT_ROUTINE_TAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
16881713 // performs a 2^k-point convolution modulo q
17181743
17191744 if (k >= tab.length()) PrecompFFTMultipliers(k, q, qinv, root, tab);
17201745
1721 NTL_THREAD_LOCAL static Vec<long> AA_store;
1746 NTL_TLS_LOCAL(Vec<long>, AA_store);
17221747 AA_store.SetLength(1L << k);
17231748 long *AA = AA_store.elts();
17241749
20232048
20242049
20252050
2026
2051 #ifdef NTL_BRC_TEST
2052 bool BRC_test_flag = false;
2053 #endif
2054
2055
2056 // FFT: lazy, tables
20272057
20282058 void NTL_FFT_ROUTINE_TAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
20292059
20582088
20592089 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
20602090
2061 NTL_THREAD_LOCAL static Vec<unsigned long> AA_store;
2091 NTL_TLS_LOCAL(Vec<unsigned long>, AA_store);
20622092 AA_store.SetLength(1L << k);
20632093 unsigned long *AA = AA_store.elts();
20642094
20652095
2066
2096 long n = 1L << k;
2097
2098 #ifndef NTL_BRC_TEST
20672099 BitReverseCopy(AA, a, k);
2068
2069 long n = 1L << k;
2100 #else
2101 if (BRC_test_flag)
2102 for (long i = 0; i < n; i++) AA[i] = a[i];
2103 else
2104 BitReverseCopy(AA, a, k);
2105 #endif
2106
20702107
20712108
20722109 /* we work with redundant representations, in the range [0, 4q) */
44 #include <NTL/new.h>
55
66 NTL_START_IMPL
7
8 NTL_TLS_GLOBAL_DECL(SmartPtr<GF2EInfoT>, GF2EInfo_stg)
9
10 NTL_CHEAP_THREAD_LOCAL
11 GF2EInfoT *GF2EInfo = 0;
712
813
914 GF2EInfoT::GF2EInfoT(const GF2X& NewP)
7984
8085
8186
82 NTL_THREAD_LOCAL
83 SmartPtr<GF2EInfoT> GF2EInfo = 0;
84
8587
8688
8789
9496
9597 void GF2EContext::save()
9698 {
97 ptr = GF2EInfo;
99 NTL_TLS_GLOBAL_ACCESS(GF2EInfo_stg);
100 ptr = GF2EInfo_stg;
98101 }
99102
100103 void GF2EContext::restore() const
101104 {
102 GF2EInfo = ptr;
105 NTL_TLS_GLOBAL_ACCESS(GF2EInfo_stg);
106 GF2EInfo_stg = ptr;
107 GF2EInfo = GF2EInfo_stg.get();
103108 }
104109
105110
126131
127132 const GF2E& GF2E::zero()
128133 {
129 NTL_THREAD_LOCAL static GF2E z(INIT_NO_ALLOC);
134 static const GF2E z(INIT_NO_ALLOC); // GLOBAL (assumes C++11 thread-safe init)
130135 return z;
131136 }
132137
1111
1212 const GF2EX& GF2EX::zero()
1313 {
14 NTL_THREAD_LOCAL static GF2EX z;
14 static const GF2EX z; // GLOBAL (assumes C++11 thread-safe init)
1515 return z;
1616 }
1717
24602460
24612461
24622462
2463 NTL_THREAD_LOCAL
2463 NTL_CHEAP_THREAD_LOCAL
24642464 long GF2EXArgBound = 0;
24652465
24662466
841841 }
842842
843843
844 NTL_THREAD_LOCAL
844 NTL_CHEAP_THREAD_LOCAL
845845 long GF2EX_BlockingFactor = 10;
846846
847847 void DDF(vec_pair_GF2EX_long& factors, const GF2EX& ff, const GF2EX& hh,
16291629
16301630 /************* NEW DDF ****************/
16311631
1632 NTL_THREAD_LOCAL long GF2EX_GCDTableSize = 4;
1633 NTL_THREAD_LOCAL double GF2EXFileThresh = NTL_FILE_THRESH;
1634 NTL_THREAD_LOCAL static vec_GF2EX *BabyStepFile = 0;
1635 NTL_THREAD_LOCAL static vec_GF2EX *GiantStepFile = 0;
1636 NTL_THREAD_LOCAL static long use_files;
1632 NTL_CHEAP_THREAD_LOCAL long GF2EX_GCDTableSize = 4;
1633 NTL_CHEAP_THREAD_LOCAL double GF2EXFileThresh = NTL_FILE_THRESH;
1634 static NTL_CHEAP_THREAD_LOCAL vec_GF2EX *BabyStepFile = 0;
1635 static NTL_CHEAP_THREAD_LOCAL vec_GF2EX *GiantStepFile = 0;
1636 static NTL_CHEAP_THREAD_LOCAL long use_files;
16371637
16381638
16391639 static
3939
4040 NTL_START_IMPL
4141
42 NTL_THREAD_LOCAL
42 NTL_CHEAP_THREAD_LOCAL
4343 long GF2X::HexOutput = 0;
4444
4545
6161
6262 const GF2X& GF2X::zero()
6363 {
64 NTL_THREAD_LOCAL static GF2X z;
64 static const GF2X z; // GLOBAL (assumes C++11 thread-safe init)
6565 return z;
6666 }
6767
10401040 // finally: the general case
10411041
10421042
1043 NTL_THREAD_LOCAL static WordVector mem;
1043 NTL_TLS_LOCAL(WordVector, mem);
10441044 WordVectorWatcher watch_mem(mem);
10451045
10461046 const _ntl_ulong *ap = a.xrep.elts(), *bp = b.xrep.elts();
14061406 // finally: the general case
14071407
14081408
1409 NTL_THREAD_LOCAL static WordVector mem;
1410 NTL_THREAD_LOCAL static WordVector stk;
1411 NTL_THREAD_LOCAL static WordVector vec;
1409 NTL_TLS_LOCAL(WordVector, mem);
1410 NTL_TLS_LOCAL(WordVector, stk);
1411 NTL_TLS_LOCAL(WordVector, vec);
14121412
14131413 WordVectorWatcher watch_mem(mem);
14141414 WordVectorWatcher watch_stk(stk);
2626
2727
2828
29 #define NTL_GF2X_GCD_CROSSOVER (XOVER_SCALE*400L*NTL_BITS_PER_LONG)
29 #define NTL_GF2X_GCD_CROSSOVER (XOVER_SCALE*300L*NTL_BITS_PER_LONG)
30
3031 #define NTL_GF2X_BERMASS_CROSSOVER (XOVER_SCALE*200L*NTL_BITS_PER_LONG)
3132
32 #define NTL_GF2X_HalfGCD_CROSSOVER (6L*NTL_BITS_PER_LONG)
33 #define NTL_GF2X_HalfGCD_CROSSOVER (4L*NTL_BITS_PER_LONG)
3334
3435
3536
3738
3839
3940
40 NTL_THREAD_LOCAL
41 static vec_GF2X stab; // used by PlainDivRem and PlainRem
42
43 NTL_THREAD_LOCAL
44 static WordVector GF2X_rembuf;
41 NTL_TLS_GLOBAL_DECL(vec_GF2X, stab)
42 // used by PlainDivRem and PlainRem
43
44 NTL_TLS_GLOBAL_DECL(WordVector, GF2X_rembuf)
4545
4646
4747 void PlainDivRem(GF2X& q, GF2X& r, const GF2X& a, const GF2X& b)
4848 {
49 NTL_TLS_GLOBAL_ACCESS(stab);
50
4951 long da, sa, posa, db, sb, posb, dq, sq, posq;
5052
5153 da = deg(a);
6870 sq = dq/NTL_BITS_PER_LONG + 1;
6971 posq = dq - NTL_BITS_PER_LONG*(sq-1);
7072
73 NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
7174 WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
7275
7376 _ntl_ulong *ap;
164167
165168 void PlainRem(GF2X& r, const GF2X& a, const GF2X& b)
166169 {
170 NTL_TLS_GLOBAL_ACCESS(stab);
171
167172 long da, sa, posa, db, sb, posb;
168173
169174 da = deg(a);
182187 posb = db - NTL_BITS_PER_LONG*(sb-1);
183188
184189
190 NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
185191 WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
186192
187193 _ntl_ulong *ap;
281287 return;
282288 }
283289
284 NTL_THREAD_LOCAL static vec_long E;
290 NTL_TLS_LOCAL(vec_long, E);
285291 E.SetLength(0);
286292 append(E, e);
287293 while (e > 8) {
462468
463469 GF2X f0;
464470 trunc(f0, f, n);
465 long deg_f0 = deg(f0);
466
467 if (F.sn > 1 && deg_f0 < NTL_BITS_PER_LONG
468 && deg_f0 >= NTL_BITS_PER_LONG/2) {
469 if (F.size >= 3*XOVER_SCALE)
470 F.method = GF2X_MOD_MUL;
471 else
472 F.method = GF2X_MOD_SPECIAL;
473 }
474 else if (F.sn > 1 && deg_f0 < NTL_BITS_PER_LONG/2) {
475 if (F.size >= 2*XOVER_SCALE)
476 F.method = GF2X_MOD_MUL;
477 else
478 F.method = GF2X_MOD_SPECIAL;
479 }
480 else if (F.size >= 4*XOVER_SCALE)
471
472 if (F.n >= (NTL_BITS_PER_LONG/2)*XOVER_SCALE)
481473 F.method = GF2X_MOD_MUL;
482474 else
483475 F.method = GF2X_MOD_PLAIN;
476
477
478 // NOTE: I've run some tests which indicate that the GF2X_MOD_SPECIAL
479 // method is not worth it.
480 // FIXME: in a future version, I should eliminate all code
481 // and data associated with GF2X_MOD_SPECIAL
482
483 // NOTE: I've run some tests which indicate that the crossover
484 // for GF2X_MOD_MUL is extremely low, even without PCLMUL support.
484485
485486
486487 if (F.method == GF2X_MOD_SPECIAL) {
12801281 UseMulRemX1(r, a, F);
12811282 }
12821283 else if (F.method == GF2X_MOD_SPECIAL) {
1284 NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
12831285 WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
12841286
12851287 long sa = a.xrep.length();
13271329 r.normalize();
13281330 }
13291331 else {
1332 NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
13301333 WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
13311334
13321335 long sa = a.xrep.length();
14041407 UseMulDivRemX1(q, r, a, F);
14051408 }
14061409 else if (F.method == GF2X_MOD_SPECIAL) {
1410 NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
14071411 WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
14081412
14091413 long sa = a.xrep.length();
14701474 r.normalize();
14711475 }
14721476 else {
1477 NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
14731478 WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
14741479
14751480 long sa = a.xrep.length();
15661571 UseMulDivX1(q, a, F);
15671572 }
15681573 else if (F.method == GF2X_MOD_SPECIAL) {
1574 NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
15691575 WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
15701576
15711577 long sa = a.xrep.length();
16181624 }
16191625 }
16201626 else {
1627 NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
16211628 WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
16221629
16231630 long sa = a.xrep.length();
19461953 }
19471954
19481955
1949 const long GF2X_DIV_CROSS = 40*XOVER_SCALE;
1956 const long GF2X_DIV_CROSS = (NTL_BITS_PER_LONG/2)*XOVER_SCALE;
19501957
19511958 void DivRem(GF2X& q, GF2X& r, const GF2X& a, const GF2X& b)
19521959 {
1953 long sa = a.xrep.length();
1954 long sb = b.xrep.length();
1955
1956 if (sb < GF2X_DIV_CROSS || sa-sb < GF2X_DIV_CROSS)
1960 long da = deg(a);
1961 long db = deg(b);
1962
1963 if (db < GF2X_DIV_CROSS || da-db < GF2X_DIV_CROSS)
19571964 PlainDivRem(q, r, a, b);
1958 else if (sa < 4*sb)
1965 else if (da < 4*db)
19591966 UseMulDivRem(q, r, a, b);
19601967 else {
19611968 GF2XModulus B;
19661973
19671974 void div(GF2X& q, const GF2X& a, const GF2X& b)
19681975 {
1969 long sa = a.xrep.length();
1970 long sb = b.xrep.length();
1971
1972 if (sb < GF2X_DIV_CROSS || sa-sb < GF2X_DIV_CROSS)
1976 long da = deg(a);
1977 long db = deg(b);
1978
1979 if (db < GF2X_DIV_CROSS || da-db < GF2X_DIV_CROSS)
19731980 PlainDiv(q, a, b);
1974 else if (sa < 4*sb)
1981 else if (da < 4*db)
19751982 UseMulDiv(q, a, b);
19761983 else {
19771984 GF2XModulus B;
19821989
19831990 void rem(GF2X& r, const GF2X& a, const GF2X& b)
19841991 {
1985 long sa = a.xrep.length();
1986 long sb = b.xrep.length();
1987
1988 if (sb < GF2X_DIV_CROSS || sa-sb < GF2X_DIV_CROSS)
1992 long da = deg(a);
1993 long db = deg(b);
1994
1995 if (db < GF2X_DIV_CROSS || da-db < GF2X_DIV_CROSS)
19891996 PlainRem(r, a, b);
1990 else if (sa < 4*sb)
1997 else if (da < 4*db)
19911998 UseMulRem(r, a, b);
19921999 else {
19932000 GF2XModulus B;
5252 printf("NTL_GF2X_NOINLINE ");
5353 #endif
5454
55 #ifdef NTL_PCLMUL
56 printf("NTL_PCLMUL ");
57 #endif
58
5559
5660 printf("\n");
5761
6165 {
6266 long n, i, j, iter, s, k;
6367 double t;
68
69 SetSeed(ZZ(0));
6470
6571
6672 for (i = 0; i < 10000; i++) {
108114
109115 iter = iter/2;
110116
111 iter = long((2/t)*iter) + 1;
117 iter = long((3/t)*iter) + 1;
112118
113119 double tvec[5];
114120 long w;
562562 CheckFinite(&p[i]);
563563 }
564564
565 NTL_THREAD_LOCAL static double red_fudge = 0;
566 NTL_THREAD_LOCAL static long log_red = 0;
567 NTL_THREAD_LOCAL static long verbose = 0;
568 NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
569 NTL_THREAD_LOCAL static double StartTime = 0;
570 NTL_THREAD_LOCAL static double LastTime = 0;
565 static NTL_CHEAP_THREAD_LOCAL double red_fudge = 0;
566 static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
567 static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
568 static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
569 static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
570 static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
571571
572572
573573
973973
974974
975975
976 NTL_THREAD_LOCAL static vec_double G_BKZConstant;
976 NTL_TLS_GLOBAL_DECL(vec_double, G_BKZConstant)
977977
978978 static
979979 void ComputeG_BKZConstant(long beta, long p)
980980 {
981 NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
982
981983 const double c_PI = 3.14159265358979323846264338328;
982984 const double LogPI = 1.14472988584940017414342735135;
983985
10281030 }
10291031 }
10301032
1031 NTL_THREAD_LOCAL static vec_double G_BKZThresh;
1033 NTL_TLS_GLOBAL_DECL(vec_double, G_BKZThresh)
10321034
10331035 static
10341036 void ComputeG_BKZThresh(double *c, long beta)
10351037 {
1038 NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
1039 NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
1040
10361041 G_BKZThresh.SetLength(beta-1);
10371042
10381043 long i;
11071112 long G_BKZ_FP(mat_ZZ& BB, mat_ZZ* UU, double delta,
11081113 long beta, long prune, LLLCheckFct check)
11091114 {
1115 NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
1116
11101117
11111118
11121119
575575 CheckFinite(&p[i]);
576576 }
577577
578 NTL_THREAD_LOCAL static quad_float red_fudge = to_quad_float(0);
579 NTL_THREAD_LOCAL static long log_red = 0;
580 NTL_THREAD_LOCAL static long verbose = 0;
581 NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
582 NTL_THREAD_LOCAL static double StartTime = 0;
583 NTL_THREAD_LOCAL static double LastTime = 0;
578 NTL_TLS_GLOBAL_DECL_INIT(quad_float, red_fudge, (to_quad_float(0)))
579
580 static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
581 static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
582 static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
583 static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
584 static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
584585
585586
586587
629630
630631 static void init_red_fudge()
631632 {
633 NTL_TLS_GLOBAL_ACCESS(red_fudge);
634
632635 long i;
633636
634637 // initial log_red should be <= NTL_DOUBLE_PRECISION-2,
644647
645648 static void inc_red_fudge()
646649 {
650 NTL_TLS_GLOBAL_ACCESS(red_fudge);
651
647652
648653 red_fudge = red_fudge * 2;
649654 log_red--;
661666 quad_float **aux,
662667 long m, long init_k, long &quit, GivensCache_QP& cache)
663668 {
669 NTL_TLS_GLOBAL_ACCESS(red_fudge);
670
664671 long n = B.NumCols();
665672
666673 long i, j, k, Fc1;
965972
966973
967974
968 NTL_THREAD_LOCAL static vec_quad_float G_BKZConstant;
975 NTL_TLS_GLOBAL_DECL(vec_quad_float, G_BKZConstant)
969976
970977 static
971978 void ComputeG_BKZConstant(long beta, long p)
972979 {
980 NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
981
973982 const quad_float c_PI =
974983 to_quad_float("3.141592653589793238462643383279502884197");
975984 const quad_float LogPI =
10221031 }
10231032 }
10241033
1025 NTL_THREAD_LOCAL static vec_quad_float G_BKZThresh;
1034 NTL_TLS_GLOBAL_DECL(vec_quad_float, G_BKZThresh)
10261035
10271036 static
10281037 void ComputeG_BKZThresh(quad_float *c, long beta)
1029 {
1038 {
1039 NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
1040 NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
1041
10301042 G_BKZThresh.SetLength(beta-1);
10311043
10321044 long i;
11011113 long G_BKZ_QP(mat_ZZ& BB, mat_ZZ* UU, quad_float delta,
11021114 long beta, long prune, LLLCheckFct check)
11031115 {
1116 NTL_TLS_GLOBAL_ACCESS(red_fudge);
1117 NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
1118
1119
11041120
11051121 long m = BB.NumRows();
11061122 long n = BB.NumCols();
15691585 long G_BKZ_QP1(mat_ZZ& BB, mat_ZZ* UU, quad_float delta,
15701586 long beta, long prune, LLLCheckFct check)
15711587 {
1588 NTL_TLS_GLOBAL_ACCESS(red_fudge);
1589 NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
1590
1591
15721592 long m = BB.NumRows();
15731593 long n = BB.NumCols();
15741594 long m_orig = m;
380380
381381 }
382382
383 NTL_THREAD_LOCAL static RR red_fudge;
384 NTL_THREAD_LOCAL static long log_red = 0;
383 NTL_TLS_GLOBAL_DECL(RR, red_fudge)
384
385 static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
385386
386387 static void init_red_fudge()
387388 {
389 NTL_TLS_GLOBAL_ACCESS(red_fudge);
390
388391 log_red = long(0.50*RR::precision());
389392
390393 power2(red_fudge, -log_red);
392395
393396 static void inc_red_fudge()
394397 {
398 NTL_TLS_GLOBAL_ACCESS(red_fudge);
399
395400
396401 mul(red_fudge, red_fudge, 2);
397402 log_red--;
405410
406411
407412
408 NTL_THREAD_LOCAL static long verbose = 0;
409 NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
410 NTL_THREAD_LOCAL static double StartTime = 0;
411 NTL_THREAD_LOCAL static double LastTime = 0;
413 static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
414 static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
415 static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
416 static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
412417
413418
414419
462467 mat_RR& aux, long m, long init_k, long &quit,
463468 GivensCache_RR& cache)
464469 {
470 NTL_TLS_GLOBAL_ACCESS(red_fudge);
471
465472 long n = B.NumCols();
466473
467474 long i, j, k, Fc1;
745752
746753
747754
748 NTL_THREAD_LOCAL static vec_RR G_BKZConstant;
755 NTL_TLS_GLOBAL_DECL(vec_RR, G_BKZConstant)
749756
750757 static
751758 void ComputeG_BKZConstant(long beta, long p)
752759 {
760 NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
761
753762 RR c_PI;
754763 ComputePi(c_PI);
755764
799808
800809 }
801810
802 NTL_THREAD_LOCAL static vec_RR G_BKZThresh;
811 NTL_TLS_GLOBAL_DECL(vec_RR, G_BKZThresh)
803812
804813 static
805814 void ComputeG_BKZThresh(RR *c, long beta)
806815 {
816 NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
817 NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
818
807819 G_BKZThresh.SetLength(beta-1);
808820
809821 long i;
885897 long G_BKZ_RR(mat_ZZ& BB, mat_ZZ* UU, const RR& delta,
886898 long beta, long prune, LLLCheckFct check)
887899 {
900 NTL_TLS_GLOBAL_ACCESS(red_fudge);
901 NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
902
903
888904 long m = BB.NumRows();
889905 long n = BB.NumCols();
890906 long m_orig = m;
348348 if (k > n) p[k] = 0;
349349 }
350350
351 NTL_THREAD_LOCAL static xdouble red_fudge = to_xdouble(0);
352 NTL_THREAD_LOCAL static long log_red = 0;
351 NTL_TLS_GLOBAL_DECL_INIT(xdouble, red_fudge, (to_xdouble(0)))
352
353 static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
353354
354355 static void init_red_fudge()
355356 {
357 NTL_TLS_GLOBAL_ACCESS(red_fudge);
358
356359 long i;
357360
358361 log_red = long(0.50*NTL_DOUBLE_PRECISION);
364367
365368 static void inc_red_fudge()
366369 {
370 NTL_TLS_GLOBAL_ACCESS(red_fudge);
371
367372
368373 red_fudge = red_fudge * 2;
369374 log_red--;
376381
377382
378383
379 NTL_THREAD_LOCAL static long verbose = 0;
380 NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
381 NTL_THREAD_LOCAL static double StartTime = 0;
382 NTL_THREAD_LOCAL static double LastTime = 0;
384 static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
385 static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
386 static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
387 static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
383388
384389
385390
432437 xdouble **aux,
433438 long m, long init_k, long &quit, GivensCache_XD& cache)
434439 {
440 NTL_TLS_GLOBAL_ACCESS(red_fudge);
441
435442 long n = B.NumCols();
436443
437444 long i, j, k, Fc1;
713720
714721
715722
716 NTL_THREAD_LOCAL static vec_xdouble G_BKZConstant;
723 NTL_TLS_GLOBAL_DECL(vec_xdouble, G_BKZConstant)
717724
718725 static
719726 void ComputeG_BKZConstant(long beta, long p)
720727 {
728 NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
729
721730 const double c_PI = 3.14159265358979323846264338328;
722731 const double LogPI = 1.14472988584940017414342735135;
723732
768777 }
769778 }
770779
771 NTL_THREAD_LOCAL static vec_xdouble G_BKZThresh;
780 NTL_TLS_GLOBAL_DECL(vec_xdouble, G_BKZThresh)
772781
773782 static
774783 void ComputeG_BKZThresh(xdouble *c, long beta)
775784 {
785 NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
786 NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
787
776788 G_BKZThresh.SetLength(beta-1);
777789
778790 long i;
846858 long G_BKZ_XD(mat_ZZ& BB, mat_ZZ* UU, xdouble delta,
847859 long beta, long prune, LLLCheckFct check)
848860 {
861 NTL_TLS_GLOBAL_ACCESS(red_fudge);
862 NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
863
864
849865 long m = BB.NumRows();
850866 long n = BB.NumCols();
851867 long m_orig = m;
0
1 /*
2 * Author: David Robert Nadeau
3 * Site: http://NadeauSoftware.com/
4 * License: Creative Commons Attribution 3.0 Unported License
5 * http://creativecommons.org/licenses/by/3.0/deed.en_US
6 */
7
8
9 // NTL NOTES: I've adapted this code from the above source.
10 // The reason is that for some multithreaded benchmarking, I want
11 // to use wall clock time, and this seemed like the best multiplatform
12 // solution to getting a high-resolution wall clock timer.
13 // The only change I made to the original code is to initialize
14 // timeConvert for the OSX case using a thread-safe initialization
15 // C++ idiom.
16
17
18
19 #if defined(_WIN32)
20 #include <Windows.h>
21
22 #elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
23 #include <unistd.h> /* POSIX flags */
24 #include <time.h> /* clock_gettime(), time() */
25 #include <sys/time.h> /* gethrtime(), gettimeofday() */
26
27 #if defined(__MACH__) && defined(__APPLE__)
28 #include <mach/mach.h>
29 #include <mach/mach_time.h>
30
31 static inline double InitTimeConvert()
32 {
33 mach_timebase_info_data_t timeBase;
34 (void)mach_timebase_info( &timeBase );
35 return (double)timeBase.numer / (double)timeBase.denom / 1000000000.0;
36 }
37
38 #endif
39
40 #else
41 #error "Unable to define GetTime( ) for an unknown OS."
42 #endif
43
44
45
46
47
48
49 /**
50 * Returns the real time, in seconds, or -1.0 if an error occurred.
51 *
52 * Time is measured since an arbitrary and OS-dependent start time.
53 * The returned real time is only useful for computing an elapsed time
54 * between two calls to this function.
55 */
56 double _ntl_GetTime( )
57 {
58 #if defined(_WIN32)
59 FILETIME tm;
60 ULONGLONG t;
61 #if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8
62 /* Windows 8, Windows Server 2012 and later. ---------------- */
63 GetSystemTimePreciseAsFileTime( &tm );
64 #else
65 /* Windows 2000 and later. ---------------------------------- */
66 GetSystemTimeAsFileTime( &tm );
67 #endif
68 t = ((ULONGLONG)tm.dwHighDateTime << 32) | (ULONGLONG)tm.dwLowDateTime;
69 return (double)t / 10000000.0;
70
71 #elif (defined(__hpux) || defined(hpux)) || ((defined(__sun__) || defined(__sun) || defined(sun)) && (defined(__SVR4) || defined(__svr4__)))
72 /* HP-UX, Solaris. ------------------------------------------ */
73 return (double)gethrtime( ) / 1000000000.0;
74
75 #elif defined(__MACH__) && defined(__APPLE__)
76 /* OSX. ----------------------------------------------------- */
77 static double timeConvert = InitTimeConvert();
78 // even in a multi-threaded environment, this will
79 // be safely initialized, according to C++11 standard
80
81 return (double)mach_absolute_time( ) * timeConvert;
82
83 #elif defined(_POSIX_VERSION)
84 /* POSIX. --------------------------------------------------- */
85 #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0)
86 {
87 struct timespec ts;
88 #if defined(CLOCK_MONOTONIC_PRECISE)
89 /* BSD. --------------------------------------------- */
90 const clockid_t id = CLOCK_MONOTONIC_PRECISE;
91 #elif defined(CLOCK_MONOTONIC_RAW)
92 /* Linux. ------------------------------------------- */
93 const clockid_t id = CLOCK_MONOTONIC_RAW;
94 #elif defined(CLOCK_HIGHRES)
95 /* Solaris. ----------------------------------------- */
96 const clockid_t id = CLOCK_HIGHRES;
97 #elif defined(CLOCK_MONOTONIC)
98 /* AIX, BSD, Linux, POSIX, Solaris. ----------------- */
99 const clockid_t id = CLOCK_MONOTONIC;
100 #elif defined(CLOCK_REALTIME)
101 /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */
102 const clockid_t id = CLOCK_REALTIME;
103 #else
104 const clockid_t id = (clockid_t)-1; /* Unknown. */
105 #endif /* CLOCK_* */
106 if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 )
107 return (double)ts.tv_sec +
108 (double)ts.tv_nsec / 1000000000.0;
109 /* Fall thru. */
110 }
111 #endif /* _POSIX_TIMERS */
112
113 /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */
114 struct timeval tm;
115 gettimeofday( &tm, NULL );
116 return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0;
117 #else
118 return -1.0; /* Failed. */
119 #endif
120 }
121
122
123
124
125
1515
1616 double _ntl_GetTime()
1717 {
18 NTL_THREAD_LOCAL static clock_t last_clock = 0;
19 NTL_THREAD_LOCAL static double acc = 0;
18 static NTL_CHEAP_THREAD_LOCAL clock_t last_clock = 0;
19 static NTL_CHEAP_THREAD_LOCAL double acc = 0;
2020
2121 clock_t this_clock;
2222 double delta;
2727 cout << "NTL_THREADS=0\n";
2828 #endif
2929
30 #ifdef NTL_DISABLE_TLS_HACK
31 cout << "NTL_DISABLE_TLS_HACK=1\n";
32 #else
33 cout << "NTL_DISABLE_TLS_HACK=0\n";
34 #endif
35
36 #ifdef NTL_ENABLE_TLS_HACK
37 cout << "NTL_ENABLE_TLS_HACK=1\n";
38 #else
39 cout << "NTL_ENABLE_TLS_HACK=0\n";
40 #endif
41
3042 #ifdef NTL_EXCEPTIONS
3143 cout << "NTL_EXCEPTIONS=1\n";
3244 #else
3345 cout << "NTL_EXCEPTIONS=0\n";
46 #endif
47
48 #ifdef NTL_THREAD_BOOST
49 cout << "NTL_THREAD_BOOST=1\n";
50 #else
51 cout << "NTL_THREAD_BOOST=0\n";
3452 #endif
3553
3654
5573 #endif
5674
5775
76 #ifdef NTL_DISABLE_LL_ASM
77 cout << "NTL_DISABLE_LL_ASM=1\n";
78 #else
79 cout << "NTL_DISABLE_LL_ASM=0\n";
80 #endif
81
82 #ifdef NTL_MAXIMIZE_SP_NBITS
83 cout << "NTL_MAXIMIZE_SP_NBITS=1\n";
84 #else
85 cout << "NTL_MAXIMIZE_SP_NBITS=0\n";
86 #endif
87
88
5889
5990 #ifdef NTL_GMP_LIP
6091 cout << "NTL_GMP_LIP=1\n";
6798 cout << "NTL_GF2X_LIB=1\n";
6899 #else
69100 cout << "NTL_GF2X_LIB=0\n";
70 #endif
71
72 #ifdef NTL_PCLMUL
73 cout << "NTL_PCLMUL=1\n";
74 #else
75 cout << "NTL_PCLMUL=0\n";
76101 #endif
77102
78103 #ifdef NTL_LONG_LONG_TYPE
425425 c[k] = b[k] - s;
426426 }
427427
428 NTL_THREAD_LOCAL static double red_fudge = 0;
429 NTL_THREAD_LOCAL static long log_red = 0;
430 NTL_THREAD_LOCAL static long verbose = 0;
431
432 NTL_THREAD_LOCAL double LLLStatusInterval = 900.0;
433 NTL_THREAD_LOCAL char *LLLDumpFile = 0;
434
435 NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
436 NTL_THREAD_LOCAL static double RR_GS_time = 0;
437 NTL_THREAD_LOCAL static double StartTime = 0;
438 NTL_THREAD_LOCAL static double LastTime = 0;
428 NTL_CHEAP_THREAD_LOCAL double LLLStatusInterval = 900.0;
429 NTL_CHEAP_THREAD_LOCAL char *LLLDumpFile = 0;
430
431 static NTL_CHEAP_THREAD_LOCAL double red_fudge = 0;
432 static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
433 static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
434
435 static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
436 static NTL_CHEAP_THREAD_LOCAL double RR_GS_time = 0;
437 static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
438 static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
439439
440440
441441
429429 c[k] = b[k] - s;
430430 }
431431
432 NTL_THREAD_LOCAL static quad_float red_fudge = to_quad_float(0);
433 NTL_THREAD_LOCAL static long log_red = 0;
434 NTL_THREAD_LOCAL static long verbose = 0;
435 NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
436 NTL_THREAD_LOCAL static double StartTime = 0;
437 NTL_THREAD_LOCAL static double LastTime = 0;
432 NTL_TLS_GLOBAL_DECL_INIT(quad_float, red_fudge, (to_quad_float(0)))
433
434 static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
435 static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
436 static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
437 static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
438 static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
438439
439440
440441 static void LLLStatus(long max_k, double t, long m, const mat_ZZ& B)
482483
483484 static void init_red_fudge()
484485 {
486 NTL_TLS_GLOBAL_ACCESS(red_fudge);
487
485488 long i;
486489
487490 // initial log_red should be <= NTL_DOUBLE_PRECISION-2,
497500
498501 static void inc_red_fudge()
499502 {
503 NTL_TLS_GLOBAL_ACCESS(red_fudge);
504
500505
501506 red_fudge = red_fudge * 2;
502507 log_red--;
514519 quad_float *b, quad_float *c,
515520 long m, long init_k, long &quit)
516521 {
522 NTL_TLS_GLOBAL_ACCESS(red_fudge);
523
517524 long n = B.NumCols();
518525
519526 long i, j, k, Fc1;
526533 quad_float *tp;
527534
528535
529 NTL_THREAD_LOCAL static double bound = 0;
536 static NTL_CHEAP_THREAD_LOCAL double bound = 0;
530537
531538
532539 if (bound == 0) {
883890
884891
885892
886 NTL_THREAD_LOCAL static vec_quad_float BKZConstant;
893 NTL_TLS_GLOBAL_DECL(vec_quad_float, BKZConstant)
887894
888895 static
889896 void ComputeBKZConstant(long beta, long p)
890897 {
898 NTL_TLS_GLOBAL_ACCESS(BKZConstant);
899
900
891901 const quad_float c_PI =
892902 to_quad_float("3.141592653589793238462643383279502884197");
893903 const quad_float LogPI =
941951 }
942952
943953
944 NTL_THREAD_LOCAL static vec_quad_float BKZThresh;
954 NTL_TLS_GLOBAL_DECL(vec_quad_float, BKZThresh)
945955
946956 static
947957 void ComputeBKZThresh(quad_float *c, long beta)
948958 {
959 NTL_TLS_GLOBAL_ACCESS(BKZConstant);
960 NTL_TLS_GLOBAL_ACCESS(BKZThresh);
961
949962 BKZThresh.SetLength(beta-1);
950963
951964 long i;
10201033 long BKZ_QP(mat_ZZ& BB, mat_ZZ* UU, quad_float delta,
10211034 long beta, long prune, LLLCheckFct check)
10221035 {
1036 NTL_TLS_GLOBAL_ACCESS(red_fudge);
1037 NTL_TLS_GLOBAL_ACCESS(BKZThresh);
1038
1039
10231040
10241041 long m = BB.NumRows();
10251042 long n = BB.NumCols();
14941511 long BKZ_QP1(mat_ZZ& BB, mat_ZZ* UU, quad_float delta,
14951512 long beta, long prune, LLLCheckFct check)
14961513 {
1514 NTL_TLS_GLOBAL_ACCESS(red_fudge);
1515 NTL_TLS_GLOBAL_ACCESS(BKZThresh);
1516
1517
14971518
14981519 long m = BB.NumRows();
14991520 long n = BB.NumCols();
161161 sub(c(k), b(k), s);
162162 }
163163
164 NTL_THREAD_LOCAL static RR red_fudge;
165 NTL_THREAD_LOCAL static long log_red = 0;
164 NTL_TLS_GLOBAL_DECL(RR, red_fudge)
165
166 static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
166167
167168 static void init_red_fudge()
168169 {
170 NTL_TLS_GLOBAL_ACCESS(red_fudge);
171
169172 log_red = long(0.50*RR::precision());
170173
171174 power2(red_fudge, -log_red);
173176
174177 static void inc_red_fudge()
175178 {
179 NTL_TLS_GLOBAL_ACCESS(red_fudge);
180
176181
177182 mul(red_fudge, red_fudge, 2);
178183 log_red--;
186191
187192
188193
189 NTL_THREAD_LOCAL static long verbose = 0;
190 NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
191 NTL_THREAD_LOCAL static double StartTime = 0;
192 NTL_THREAD_LOCAL static double LastTime = 0;
194 static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
195 static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
196 static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
197 static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
193198
194199
195200
242247 LLLCheckFct check, mat_RR& B1, mat_RR& mu,
243248 vec_RR& b, vec_RR& c, long m, long init_k, long &quit)
244249 {
250 NTL_TLS_GLOBAL_ACCESS(red_fudge);
251
245252 long n = B.NumCols();
246253
247254 long i, j, k, Fc1;
587594
588595
589596
590 NTL_THREAD_LOCAL static vec_RR BKZConstant;
597 NTL_TLS_GLOBAL_DECL(vec_RR, BKZConstant)
591598
592599 static
593600 void ComputeBKZConstant(long beta, long p)
594601 {
602 NTL_TLS_GLOBAL_ACCESS(BKZConstant);
603
595604 RR c_PI;
596605 ComputePi(c_PI);
597606
641650
642651 }
643652
644 NTL_THREAD_LOCAL static vec_RR BKZThresh;
653 NTL_TLS_GLOBAL_DECL(vec_RR, BKZThresh)
645654
646655 static
647656 void ComputeBKZThresh(RR *c, long beta)
648657 {
658 NTL_TLS_GLOBAL_ACCESS(BKZConstant);
659 NTL_TLS_GLOBAL_ACCESS(BKZThresh);
660
649661 BKZThresh.SetLength(beta-1);
650662
651663 long i;
727739 long BKZ_RR(mat_ZZ& BB, mat_ZZ* UU, const RR& delta,
728740 long beta, long prune, LLLCheckFct check)
729741 {
742 NTL_TLS_GLOBAL_ACCESS(red_fudge);
743 NTL_TLS_GLOBAL_ACCESS(BKZThresh);
744
745
730746 long m = BB.NumRows();
731747 long n = BB.NumCols();
732748 long m_orig = m;
11941210
11951211 void NearVector(vec_ZZ& ww, const mat_ZZ& BB, const vec_ZZ& a)
11961212 {
1213 NTL_TLS_GLOBAL_ACCESS(red_fudge);
1214
11971215 long n = BB.NumCols();
11981216
11991217 if (n != BB.NumRows())
189189 c[k] = b[k] - s;
190190 }
191191
192 NTL_THREAD_LOCAL static xdouble red_fudge = to_xdouble(0);
193 NTL_THREAD_LOCAL static long log_red = 0;
192 NTL_TLS_GLOBAL_DECL_INIT(xdouble, red_fudge, (to_xdouble(0)))
193
194
195 static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
194196
195197 static void init_red_fudge()
196198 {
199 NTL_TLS_GLOBAL_ACCESS(red_fudge);
200
197201 long i;
198202
199203 log_red = long(0.50*NTL_DOUBLE_PRECISION);
205209
206210 static void inc_red_fudge()
207211 {
212 NTL_TLS_GLOBAL_ACCESS(red_fudge);
213
208214
209215 red_fudge = red_fudge * 2;
210216 log_red--;
217223
218224
219225
220 NTL_THREAD_LOCAL static long verbose = 0;
221 NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
222 NTL_THREAD_LOCAL static double StartTime = 0;
223 NTL_THREAD_LOCAL static double LastTime = 0;
226 static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
227 static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
228 static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
229 static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
224230
225231
226232
273279 xdouble *b, xdouble *c,
274280 long m, long init_k, long &quit)
275281 {
282 NTL_TLS_GLOBAL_ACCESS(red_fudge);
283
276284 long n = B.NumCols();
277285
278286 long i, j, k, Fc1;
284292 xdouble *tp;
285293
286294
287 NTL_THREAD_LOCAL static xdouble bound = to_xdouble(0);
295 NTL_TLS_LOCAL_INIT(xdouble, bound, (to_xdouble(0)));
288296
289297
290298 if (bound == 0) {
616624
617625
618626
619 NTL_THREAD_LOCAL static vec_xdouble BKZConstant;
627 NTL_TLS_GLOBAL_DECL(vec_xdouble, BKZConstant)
620628
621629 static
622630 void ComputeBKZConstant(long beta, long p)
623631 {
632 NTL_TLS_GLOBAL_ACCESS(BKZConstant);
633
624634 const double c_PI = 3.14159265358979323846264338328;
625635 const double LogPI = 1.14472988584940017414342735135;
626636
671681 }
672682 }
673683
674 NTL_THREAD_LOCAL static vec_xdouble BKZThresh;
684 NTL_TLS_GLOBAL_DECL(vec_xdouble, BKZThresh)
675685
676686 static
677687 void ComputeBKZThresh(xdouble *c, long beta)
678688 {
689 NTL_TLS_GLOBAL_ACCESS(BKZConstant);
690 NTL_TLS_GLOBAL_ACCESS(BKZThresh);
691
679692 BKZThresh.SetLength(beta-1);
680693
681694 long i;
749762 long BKZ_XD(mat_ZZ& BB, mat_ZZ* UU, xdouble delta,
750763 long beta, long prune, LLLCheckFct check)
751764 {
765 NTL_TLS_GLOBAL_ACCESS(red_fudge);
766 NTL_TLS_GLOBAL_ACCESS(BKZThresh);
767
768
752769 long m = BB.NumRows();
753770 long n = BB.NumCols();
754771 long m_orig = m;
+0
-24
src/MakeCheckCLZL less more
0
1 echo "Checking for __builtin_clzl"
2
3 cp ../include/NTL/have_builtin_clzl_no.h ../include/NTL/have_builtin_clzl.h
4
5 sh RemoveProg CheckCLZL
6 echo $1 -o CheckCLZL CheckCLZL.c CheckCLZLAux.c $2
7 $1 -o CheckCLZL CheckCLZL.c CheckCLZLAux.c $2
8
9 if test -f CheckCLZL
10 then
11 if ./CheckCLZL
12 then
13 echo "have __builtin_clzl"
14 cp ../include/NTL/have_builtin_clzl_yes.h ../include/NTL/have_builtin_clzl.h
15 sh RemoveProg CheckCLZL
16 exit 0
17 fi
18 fi
19
20 echo "DO NOT have __builtin__clzl"
21 sh RemoveProg CheckCLZL
22 exit 0
23
0
1 printf '*** Checking for feature: %s ' "$1"
2
3 echo "" > "../include/NTL/HAVE_$1.h"
4
5 sh RemoveProg CheckFeature
6 echo $3 -o CheckFeature $2 $4 >> "CheckFeature.log" 2>&1
7 $3 -o CheckFeature $2 $4 >> "CheckFeature.log" 2>&1
8
9 if test -f CheckFeature
10 then
11 if ./CheckFeature
12 then
13 echo "[yes]"
14 echo "#ifndef NTL_HAVE_$1" > "../include/NTL/HAVE_$1.h"
15 echo "#define NTL_HAVE_$1" >> "../include/NTL/HAVE_$1.h"
16 echo "#endif" >> "../include/NTL/HAVE_$1.h"
17 sh RemoveProg CheckFeature
18 exit 0
19 fi
20 fi
21
22 echo "[no]"
23 sh RemoveProg CheckFeature
24 exit 0
25
+0
-24
src/MakeCheckLL less more
0
1 echo "Checking for working LL type"
2
3 cp ../include/NTL/have_LL_no.h ../include/NTL/have_LL.h
4
5 sh RemoveProg CheckLL
6 echo $1 -o CheckLL CheckLL.c CheckLLAux.c $2
7 $1 -o CheckLL CheckLL.c CheckLLAux.c $2
8
9 if test -f CheckLL
10 then
11 if ./CheckLL
12 then
13 echo "have LL type"
14 cp ../include/NTL/have_LL_yes.h ../include/NTL/have_LL.h
15 sh RemoveProg CheckLL
16 exit 0
17 fi
18 fi
19
20 echo "DO NOT have LL type"
21 sh RemoveProg CheckLL
22 exit 0
23
4646 void touch_double(double* x);
4747 void touch_ldouble(long double* x);
4848
49
50
51
52 double power2(long k)
53 {
54 long i;
55 double res;
56
57 res = 1;
58
59 for (i = 1; i <= k; i++)
60 res = res * 2;
61
62 return res;
63 }
64
49 double sum_double(double *x, long n);
50
51 double fma_test(double a, double b, double c);
52
53
54 double power2(long k);
55
56
57 long FMADetected(long dp)
58 {
59 double x = power2(0) + power2(dp-1);
60 double y = power2(0) + power2(dp-1);
61
62 touch_double(&x);
63 touch_double(&y);
64
65 double z = x*y;
66 touch_double(&z);
67 z = -z;
68 touch_double(&z);
69
70 double lo = fma_test(x, y, z);
71 return lo != 0;
72 }
6573
6674 long DoubleRounding(long dp)
6775 {
6876 double a = power2(dp-1) + 1;
6977 double b = (power2(dp)-1)/power2(dp+1);
70 register double x = a + b;
71 double y = x;
72
73 touch_double(&y);
74
75 if (y != power2(dp-1) + 1)
78
79 double vec[2];
80 vec[0] = a;
81 vec[1] = b;
82
83 double sum = sum_double(vec, 2);
84
85 touch_double(&sum);
86
87 if (sum != a)
7688 return 1;
7789 else
7890 return 0;
97109 eps *= 1.0/2.0;
98110 tmp = 1.0 + eps;
99111 touch_double(&tmp);
100 res = tmp - one;
101 } while (res == eps);
102
103 return k;
104 }
105
106 long DoublePrecision1()
107 {
108 double eps, one, res;
109 long k;
110
111 one = val_double(1.0);
112 eps = val_double(1.0);
113
114 k = 0;
115
116 do {
117 register double tmp;
118
119 k++;
120 eps *= 1.0/2.0;
121 tmp = 1.0 + eps;
122112 res = tmp - one;
123113 } while (res == eps);
124114
763753 int main()
764754 {
765755 long bpl, bpi, bpt, rs_arith, nbits, wnbits;
766 long dp, dp1, dr;
756 long dp, dr;
757 long fma_detected;
767758 long ldp;
768759 FILE *f;
769760 long warnings = 0;
839830 }
840831
841832 /*
842 * check that ints are bigger than chars.
843 */
844
845 if (bpi <= CHAR_BIT) {
846 fprintf(stderr, "BAD NEWS: int type must be longer than char type.\n");
847 return 1;
848 }
849
833 * check that there are 8 bits in a char. This is a POSIX requirement.
834 */
835
836 if (CHAR_BIT != 8) {
837 fprintf(stderr, "BAD NEWS: char type must have 8 bits.\n");
838 return 1;
839 }
840
841
842 /*
843 * check that bpi is a multiple of 8.
844 */
845
846 if (bpi % 8 != 0) {
847 fprintf(stderr, "BAD NEWS: int type must be multiple of 8 bits.\n");
848 return 1;
849 }
850850
851851
852852 /*
854854 */
855855
856856 if (bpl % 8 != 0) {
857 fprintf(stderr, "BAD NEWS: word size must be multiple of 8 bits.\n");
858 return 1;
859 }
860
861
857 fprintf(stderr, "BAD NEWS: long type must be multiple of 8 bits.\n");
858 return 1;
859 }
862860
863861
864862 /*
995993 * This test almost always yields the correct result --- if not,
996994 * you will have to set the NTL_EXT_DOUBLE in "mach_desc.h"
997995 * by hand.
998 *
999 * The test effectively proves that in-register doubles are wide
1000 * if dp1 > dp || dr.
1001 */
1002
1003
1004 dp1 = DoublePrecision1();
996 */
997
998
1005999 dr = DoubleRounding(dp);
1000
1001
1002 /*
1003 * Next, we check if the platform uses FMA (fused multiply add),
1004 * even across statement boundaries.
1005 */
1006
1007 fma_detected = FMADetected(dp);
1008
10061009
10071010
10081011 /*
10691072 fprintf(stderr, "long double precision = %ld\n", ldp);
10701073 fprintf(stderr, "NBITS (maximum) = %ld\n", nbits);
10711074 fprintf(stderr, "WNBITS (maximum) = %ld\n", wnbits);
1072 fprintf(stderr, "register double precision = %ld\n", dp1);
10731075 fprintf(stderr, "double rounding detected = %s\n", yn_vec[dr]);
1074
1075 if (((dp1 > dp) || dr) && GNUC_INTEL)
1076 fprintf(stderr, "FMA detected = %s\n", yn_vec[fma_detected]);
1077
1078 if (dr && GNUC_INTEL)
10761079 fprintf(stderr, "-- auto x86 fix\n");
10771080
10781081 if (dp != 53) {
10961099
10971100 #endif
10981101
1099 if (((dp1 > dp) || dr) && !GNUC_INTEL) {
1102 if (dr && !GNUC_INTEL) {
11001103 warnings = 1;
11011104 fprintf(stderr, "\n\nWARNING:\n\n");
11021105 fprintf(stderr, "This platform has extended double precision registers.\n");
11711174 fprintf(f, "#define NTL_QUAD_FLOAT_SPLIT (");
11721175 print2k(f, dp - (dp/2), bpl);
11731176 fprintf(f, "+1.0)\n");
1174 fprintf(f, "#define NTL_EXT_DOUBLE (%d)\n", ((dp1 > dp) || dr));
1177 fprintf(f, "#define NTL_EXT_DOUBLE (%ld)\n", dr);
1178
1179 fprintf(f, "#define NTL_FMA_DETECTED (%ld)\n", fma_detected);
1180
1181
1182
11751183 print_BB_mul_code(f, bpl);
11761184 print_BB_sqr_code(f, bpl);
11771185 print_BB_rev_code(f, bpl);
2323 void touch_double(double* x) {}
2424 void touch_ldouble(long double* x) {}
2525
/*
 * Returns x[0] + x[1] + ... + x[n-1], accumulated left to right starting
 * from 0.  A non-positive n yields 0.
 */
double sum_double(double *x, long n)
{
   double total = 0;
   long remaining = n;

   /* walk the array front to back so the additions happen in index order */
   while (remaining > 0) {
      total += *x;
      x++;
      remaining--;
   }

   return total;
}
36
/*
 * Computes a*b + c as two separate statements: first the product, then the
 * sum.
 *
 * NOTE(review): presumably this is the probe behind the "FMA across
 * statement boundaries" check -- keep the multiply and the add in distinct
 * statements rather than folding them into one expression.
 */
double fma_test(double a, double b, double c)
{
   double product = a*b;
   double result = product + c;
   return result;
}
43
/*
 * Returns 2^k as a double, computed by k successive doublings of 1.
 * For k <= 0 no doubling is performed and the result is 1.
 */
double power2(long k)
{
   double result = 1;

   while (k > 0) {
      result = result * 2;
      k--;
   }

   return result;
}
77
88
99 sh RemoveProg TestGetPID
10 echo $1 -o TestGetPID TestGetPID.c GetPID1.c $2
11 $1 -o TestGetPID TestGetPID.c GetPID1.c $2
10 echo $1 -o TestGetPID TestGetPID.c GetPID1.c $2 >> "CheckFeature.log" 2>&1
11 $1 -o TestGetPID TestGetPID.c GetPID1.c $2 >> "CheckFeature.log" 2>&1
1212
1313 if test -f TestGetPID
1414 then
77
88
99 sh RemoveProg TestGetTime
10 echo $1 -o TestGetTime TestGetTime.c GetTime1.c $2
11 $1 -o TestGetTime TestGetTime.c GetTime1.c $2
10 echo $1 -o TestGetTime TestGetTime.c GetTime1.c $2 >> "CheckFeature.log" 2>&1
11 $1 -o TestGetTime TestGetTime.c GetTime1.c $2 >> "CheckFeature.log" 2>&1
1212
1313 if test -f TestGetTime
1414 then
15 if ./TestGetTime 1 1048576 1048575
15 if ./TestGetTime 1 1048576 1048575 >> "CheckFeature.log" 2>&1
1616 then
1717 cp GetTime1.c GetTime.c
1818 echo "using GetTime1.c"
2222
2323
2424 sh RemoveProg TestGetTime
25 echo $1 -o TestGetTime TestGetTime.c GetTime2.c $2
26 $1 -o TestGetTime TestGetTime.c GetTime2.c $2
25 echo $1 -o TestGetTime TestGetTime.c GetTime2.c $2 >> "CheckFeature.log" 2>&1
26 $1 -o TestGetTime TestGetTime.c GetTime2.c $2 >> "CheckFeature.log" 2>&1
2727
2828 if test -f TestGetTime
2929 then
30 if ./TestGetTime 1 1048576 1048575
30 if ./TestGetTime 1 1048576 1048575 >> "CheckFeature.log" 2>&1
3131 then
3232 cp GetTime2.c GetTime.c
3333 echo "using GetTime2.c"
3636 fi
3737
3838 sh RemoveProg TestGetTime
39 echo $1 -o TestGetTime TestGetTime.c GetTime3.c $2
40 $1 -o TestGetTime TestGetTime.c GetTime3.c $2
39 echo $1 -o TestGetTime TestGetTime.c GetTime3.c $2 >> "CheckFeature.log" 2>&1
40 $1 -o TestGetTime TestGetTime.c GetTime3.c $2 >> "CheckFeature.log" 2>&1
4141
4242 if test -f TestGetTime
4343 then
44 if ./TestGetTime 1 1048576 1048575
44 if ./TestGetTime 1 1048576 1048575 >> "CheckFeature.log" 2>&1
4545 then
4646 cp GetTime3.c GetTime.c
4747 echo "using GetTime3.c"
5050 fi
5151
5252 sh RemoveProg TestGetTime
53 echo $1 -o TestGetTime TestGetTime.c GetTime4.c $2
54 $1 -o TestGetTime TestGetTime.c GetTime4.c $2
53 echo $1 -o TestGetTime TestGetTime.c GetTime4.c $2 >> "CheckFeature.log" 2>&1
54 $1 -o TestGetTime TestGetTime.c GetTime4.c $2 >> "CheckFeature.log" 2>&1
5555
5656
5757 if test -f TestGetTime
5858 then
59 if ./TestGetTime 1 1048576 1048575
59 if ./TestGetTime 1 1048576 1048575 >> "CheckFeature.log" 2>&1
6060 then
6161 cp GetTime4.c GetTime.c
6262 echo "using GetTime4.c"
6565 fi
6666
6767 sh RemoveProg TestGetTime
68 echo $1 -o TestGetTime TestGetTime.c GetTime5.c $2
69 $1 -o TestGetTime TestGetTime.c GetTime5.c $2
68 echo $1 -o TestGetTime TestGetTime.c GetTime5.c $2 >> "CheckFeature.log" 2>&1
69 $1 -o TestGetTime TestGetTime.c GetTime5.c $2 >> "CheckFeature.log" 2>&1
7070
7171
7272 if test -f TestGetTime
6161 }
6262
6363 #endif
64
65 SetSeed(ZZ(0));
6466
6567 long i, k;
6668
127129
128130
129131 iter = iter/2;
130 iter = long((2/t)*iter) + 1;
132 iter = long((3/t)*iter) + 1;
131133
132134 double tvec[5];
133135 long w;
00
1 ============================
21
32 FIXME: maybe it would make more sense to take the +1/-1 logic
43 out of [cg]_lip_impl block_construct routines and just put it in
1716 * add template functions clear(), to clear multiple
1817 entries in a Vec or Poly. The important thing is
1918 to provide specialized ones for Vec<GF2> and GF2X.
20 * RandomBnd_long is too slow...it is not a great idea
21 to work with ZZ's for large n...better strategy would be
22 simple "expected 2 iterations" strategy.
2319
2420
2521
4238 make sure these changes are implemented in the template files
4339 mfile and cfile, and then run:
4440
45 ./configure --nowrite
46 cp mfileout def_makefile
47 cp cfileout ../include/NTL/def_config.h
48
49 - run:
50
51 NOTE: try executing
5241 export COPYFILE_DISABLE=1
53 beforehand
54
55
5642 make ppdoc
5743 make ppclean
5844 make package
7575
7676 #endif
7777
78 SetSeed(ZZ(0));
79
7880
7981 long n, k;
8082
8486 ZZ p;
8587
8688 RandomLen(p, k);
89 if (!IsOdd(p)) p++;
8790
8891
8992 ZZ_p::init(p); // initialization
135138
136139 for (r = 0; r < nprimes; r++) UseFFTPrime(r);
137140
138 vec_long aa[nprimes], AA[nprimes];
141 vec_long A1[nprimes], A2[nprimes];
142 vec_long B1[nprimes], B2[nprimes];
139143
140144 for (r = 0; r < nprimes; r++) {
141 aa[r].SetLength(N);
142 AA[r].SetLength(N);
143
144 for (i = 0; i < N; i++)
145 aa[r][i] = RandomBnd(GetFFTPrime(r));
146
147
148 FFTFwd(AA[r].elts(), aa[r].elts(), L, r);
149 FFTRev1(AA[r].elts(), AA[r].elts(), L, r);
145 A1[r].SetLength(N);
146 A2[r].SetLength(N);
147 B1[r].SetLength(N);
148 B2[r].SetLength(N);
149
150 for (i = 0; i < N; i++) {
151 A1[r][i] = RandomBnd(GetFFTPrime(r));
152 A2[r][i] = RandomBnd(GetFFTPrime(r));
153 }
154 }
155
156 for (r = 0; r < nprimes; r++) {
157 long *A1p = A1[r].elts();
158 long *A2p = A2[r].elts();
159 long *B1p = B1[r].elts();
160 long *B2p = B2[r].elts();
161 long q = GetFFTPrime(r);
162 mulmod_t qinv = GetFFTPrimeInv(r);
163
164 FFTFwd(B1p, A1p, L, r);
165 FFTFwd(B2p, A2p, L, r);
166 for (i = 0; i < N; i++) B1p[i] = NormalizedMulMod(B1p[i], B2p[i], q, qinv);
167 FFTRev1(B1p, B1p, L, r);
150168 }
151169
152170 iter = 1;
155173 t = GetTime();
156174 for (j = 0; j < iter; j++) {
157175 for (r = 0; r < nprimes; r++) {
158 long *AAp = AA[r].elts();
159 long *aap = aa[r].elts();
176 long *A1p = A1[r].elts();
177 long *A2p = A2[r].elts();
178 long *B1p = B1[r].elts();
179 long *B2p = B2[r].elts();
160180 long q = GetFFTPrime(r);
161181 mulmod_t qinv = GetFFTPrimeInv(r);
162182
163 FFTFwd(AAp, aap, L, r);
164 FFTRev1(AAp, aap, L, r);
165 for (i = 0; i < N; i++) AAp[i] = NormalizedMulMod(AAp[i], aap[i], q, qinv);
183 FFTFwd(B1p, A1p, L, r);
184 FFTFwd(B2p, A2p, L, r);
185 for (i = 0; i < N; i++) B1p[i] = NormalizedMulMod(B1p[i], B2p[i], q, qinv);
186 FFTRev1(B1p, B1p, L, r);
166187 }
167188 }
168189 t = GetTime() - t;
171192
172193 iter = iter/2;
173194
174 iter = long((1.5/t)*iter) + 1;
195 iter = long((3/t)*iter) + 1;
175196
176197
177198 double tvec[5];
181202 t = GetTime();
182203 for (j = 0; j < iter; j++) {
183204 for (r = 0; r < nprimes; r++) {
184 long *AAp = AA[r].elts();
185 long *aap = aa[r].elts();
205 long *A1p = A1[r].elts();
206 long *A2p = A2[r].elts();
207 long *B1p = B1[r].elts();
208 long *B2p = B2[r].elts();
186209 long q = GetFFTPrime(r);
187210 mulmod_t qinv = GetFFTPrimeInv(r);
188211
189 FFTFwd(AAp, aap, L, r);
190 FFTRev1(AAp, aap, L, r);
191 for (i = 0; i < N; i++) AAp[i] = NormalizedMulMod(AAp[i], aap[i], q, qinv);
212 FFTFwd(B1p, A1p, L, r);
213 FFTFwd(B2p, A2p, L, r);
214 for (i = 0; i < N; i++) B1p[i] = NormalizedMulMod(B1p[i], B2p[i], q, qinv);
215 FFTRev1(B1p, B1p, L, r);
192216 }
193217 }
194218 t = GetTime() - t;
0
1 #include <NTL/ZZ_pX.h>
2
3 #include <cstdio>
4
5 NTL_CLIENT
6
7
// Trimmed mean of the five samples t[0..4].
//
// Locates the first occurrence of the smallest and of the largest sample,
// discards those two positions, and returns the mean of what is left.
// When all five samples are equal, both positions are index 0, so only one
// sample is discarded and the remaining four are averaged.
double clean_data(double *t)
{
   const long count = 5;
   long lo = 0;   // position of the smallest sample seen so far
   long hi = 0;   // position of the largest sample seen so far
   long idx;

   for (idx = 1; idx < count; idx++) {
      if (t[idx] < t[lo]) lo = idx;
      if (t[idx] > t[hi]) hi = idx;
   }

   double total = 0;
   long kept = 0;

   for (idx = 0; idx < count; idx++) {
      if (idx == lo || idx == hi) continue;
      total += t[idx];
      kept++;
   }

   return total/kept;
}
36
// Prints the name of the compile-time variant being timed (TBL_REM,
// TBL_REM_LL, or DEFAULT when neither macro is defined), followed by a
// newline, on stdout.
void print_flag()
{
#if (defined(NTL_TBL_REM))
   static const char label[] = "TBL_REM ";
#elif (defined(NTL_TBL_REM_LL))
   static const char label[] = "TBL_REM_LL ";
#else
   static const char label[] = "DEFAULT ";
#endif

   fputs(label, stdout);
   printf("\n");
}
53
54
55 int main()
56 {
57
58 #if (defined(NTL_TBL_REM) && defined(NTL_GMP_LIP) && !(defined(NTL_HAVE_LL_TYPE) && NTL_ZZ_NBITS == NTL_BITS_PER_LONG))
59 {
60 printf("999999999999999 ");
61 print_flag();
62 return 0;
63 }
64
65 #endif
66
67 #if (defined(NTL_TBL_REM_LL) && !defined(NTL_GMP_LIP) && !defined(NTL_HAVE_LL_TYPE))
68 {
69 printf("999999999999999 ");
70 print_flag();
71 return 0;
72 }
73
74 #endif
75
76 SetSeed(ZZ(0));
77
78 long n, k;
79
80 n = 200;
81 k = 10*NTL_ZZ_NBITS;
82
83 ZZ p;
84
85 RandomLen(p, k);
86 if (!IsOdd(p)) p++;
87
88
89 ZZ_p::init(p); // initialization
90
91 ZZ_pX f, g, h, r1, r2, r3;
92
93 random(g, n); // g = random polynomial of degree < n
94 random(h, n); // h = " "
95 random(f, n); // f = " "
96
97 SetCoeff(f, n); // Sets coefficient of X^n to 1
98
99 // For doing arithmetic mod f quickly, one must pre-compute
100 // some information.
101
102 ZZ_pXModulus F;
103 build(F, f);
104
105 PlainMul(r1, g, h); // this uses classical arithmetic
106 PlainRem(r1, r1, f);
107
108 MulMod(r2, g, h, F); // this uses the FFT
109
110 MulMod(r3, g, h, f); // uses FFT, but slower
111
112 // compare the results...
113
114 if (r1 != r2) {
115 printf("999999999999999 ");
116 print_flag();
117 return 0;
118 }
119 else if (r1 != r3) {
120 printf("999999999999999 ");
121 print_flag();
122 return 0;
123 }
124
125 double t;
126 long i;
127 long iter;
128
129 n = 1024;
130 k = 1600;
131 RandomLen(p, k);
132 if (!IsOdd(p)) p++;
133
134 ZZ_p::init(p);
135
136 ZZ_pX a;
137 random(a, n);
138 long da = deg(a);
139
140 ZZ_pXModRep modrep;
141 ToZZ_pXModRep(modrep, a, 0, da);
142
143 iter = 1;
144
145 do {
146 t = GetTime();
147 for (i = 0; i < iter; i++) {
148 ToZZ_pXModRep(modrep, a, 0, da);
149 }
150 t = GetTime() - t;
151 iter = 2*iter;
152 } while(t < 1);
153
154 iter = iter/2;
155
156 iter = long((3/t)*iter) + 1;
157
158 double tvec[5];
159 long w;
160
161 for (w = 0; w < 5; w++) {
162 t = GetTime();
163 for (i = 0; i < iter; i++) {
164 ToZZ_pXModRep(modrep, a, 0, da);
165 }
166 t = GetTime() - t;
167 tvec[w] = t;
168 }
169
170
171 t = clean_data(tvec);
172
173 t = floor((t/iter)*1e12);
174
175 if (t < 0 || t >= 1e15)
176 printf("999999999999999 ");
177 else
178 printf("%015.0f ", t);
179
180 printf(" [%ld] ", iter);
181
182 print_flag();
183
184 return 0;
185 }
0
1 #include <NTL/ZZ_pX.h>
2
3 #include <cstdio>
4
5 NTL_CLIENT
6
7
// Trimmed mean of the five samples t[0..4].
//
// Locates the first occurrence of the smallest and of the largest sample,
// discards those two positions, and returns the mean of what is left.
// When all five samples are equal, both positions are index 0, so only one
// sample is discarded and the remaining four are averaged.
double clean_data(double *t)
{
   const long count = 5;
   long lo = 0;   // position of the smallest sample seen so far
   long hi = 0;   // position of the largest sample seen so far
   long idx;

   for (idx = 1; idx < count; idx++) {
      if (t[idx] < t[lo]) lo = idx;
      if (t[idx] > t[hi]) hi = idx;
   }

   double total = 0;
   long kept = 0;

   for (idx = 0; idx < count; idx++) {
      if (idx == lo || idx == hi) continue;
      total += t[idx];
      kept++;
   }

   return total/kept;
}
36
// Prints the name of the compile-time variant being timed (CRT_ALTCODE, or
// DEFAULT when NTL_CRT_ALTCODE is not defined), followed by a newline, on
// stdout.
void print_flag()
{
#if (defined(NTL_CRT_ALTCODE))
   static const char label[] = "CRT_ALTCODE ";
#else
   static const char label[] = "DEFAULT ";
#endif

   fputs(label, stdout);
   printf("\n");
}
51
52
53 int main()
54 {
55
56 #if (defined(NTL_CRT_ALTCODE) && !(defined(NTL_HAVE_LL_TYPE) && NTL_ZZ_NBITS == NTL_BITS_PER_LONG))
57
58 {
59 printf("999999999999999 ");
60 print_flag();
61 return 0;
62 }
63
64
65 #endif
66
67 SetSeed(ZZ(0));
68
69 long n, k;
70
71 n = 1024;
72 k = 30*NTL_SP_NBITS;
73
74 ZZ p;
75
76 RandomLen(p, k);
77 if (!IsOdd(p)) p++;
78
79
80 ZZ_p::init(p); // initialization
81
82 ZZ_pX f, g, h, r1, r2, r3;
83
84 random(g, n); // g = random polynomial of degree < n
85 random(h, n); // h = " "
86 random(f, n); // f = " "
87
88 SetCoeff(f, n); // Sets coefficient of X^n to 1
89
90 // For doing arithmetic mod f quickly, one must pre-compute
91 // some information.
92
93 ZZ_pXModulus F;
94 build(F, f);
95
96 PlainMul(r1, g, h); // this uses classical arithmetic
97 PlainRem(r1, r1, f);
98
99 MulMod(r2, g, h, F); // this uses the FFT
100
101 MulMod(r3, g, h, f); // uses FFT, but slower
102
103 // compare the results...
104
105 if (r1 != r2) {
106 printf("999999999999999 ");
107 print_flag();
108 return 0;
109 }
110 else if (r1 != r3) {
111 printf("999999999999999 ");
112 print_flag();
113 return 0;
114 }
115
116 double t;
117 long i;
118 long iter;
119
120 ZZ_pX a, b, c;
121 random(a, n);
122 random(b, n);
123 long da = deg(a);
124 long db = deg(b);
125 long dc = da + db;
126 long l = NextPowerOfTwo(dc+1);
127
128 FFTRep arep, brep, crep;
129 ToFFTRep(arep, a, l, 0, da);
130 ToFFTRep(brep, b, l, 0, db);
131
132 mul(crep, arep, brep);
133
134 ZZ_pXModRep modrep;
135 FromFFTRep(modrep, crep);
136
137 FromZZ_pXModRep(c, modrep, 0, dc);
138
139 iter = 1;
140
141 do {
142 t = GetTime();
143 for (i = 0; i < iter; i++) {
144 FromZZ_pXModRep(c, modrep, 0, dc);
145 }
146 t = GetTime() - t;
147 iter = 2*iter;
148 } while(t < 1);
149
150 iter = iter/2;
151
152 iter = long((3/t)*iter) + 1;
153
154 double tvec[5];
155 long w;
156
157 for (w = 0; w < 5; w++) {
158 t = GetTime();
159 for (i = 0; i < iter; i++) {
160 FromZZ_pXModRep(c, modrep, 0, dc);
161 }
162 t = GetTime() - t;
163 tvec[w] = t;
164 }
165
166
167 t = clean_data(tvec);
168
169 t = floor((t/iter)*1e12);
170
171 // The following is just to test some tuning Wizard logic --
172 // be sure to get rid of this!!
173 #if (defined(NTL_CRT_ALTCODE))
174 // t *= 1.12;
175 #endif
176
177 if (t < 0 || t >= 1e15)
178 printf("999999999999999 ");
179 else
180 printf("%015.0f ", t);
181
182 printf(" [%ld] ", iter);
183
184 print_flag();
185
186 return 0;
187 }
+0
-176
src/PolyTimeTest.c less more
0
1 #include <NTL/ZZ_pX.h>
2
3 #include <cstdio>
4
5 NTL_CLIENT
6
7
// Trimmed mean of the five samples t[0..4].
//
// Locates the first occurrence of the smallest and of the largest sample,
// discards those two positions, and returns the mean of what is left.
// When all five samples are equal, both positions are index 0, so only one
// sample is discarded and the remaining four are averaged.
double clean_data(double *t)
{
   const long count = 5;
   long lo = 0;   // position of the smallest sample seen so far
   long hi = 0;   // position of the largest sample seen so far
   long idx;

   for (idx = 1; idx < count; idx++) {
      if (t[idx] < t[lo]) lo = idx;
      if (t[idx] > t[hi]) hi = idx;
   }

   double total = 0;
   long kept = 0;

   for (idx = 0; idx < count; idx++) {
      if (idx == lo || idx == hi) continue;
      total += t[idx];
      kept++;
   }

   return total/kept;
}
36
// Prints the name of the compile-time variant being timed (TBL_REM,
// TBL_REM_LL, or DEFAULT when neither macro is defined), followed by a
// newline, on stdout.
void print_flag()
{
#if (defined(NTL_TBL_REM))
   static const char label[] = "TBL_REM ";
#elif (defined(NTL_TBL_REM_LL))
   static const char label[] = "TBL_REM_LL ";
#else
   static const char label[] = "DEFAULT ";
#endif

   fputs(label, stdout);
   printf("\n");
}
53
54
55 int main()
56 {
57
58 #if (defined(NTL_TBL_REM) && defined(NTL_GMP_LIP))
59
60 if (sizeof(NTL_ULL_TYPE) != 2*sizeof(long) ||
61 NTL_ZZ_NBITS != NTL_BITS_PER_LONG) {
62 printf("999999999999999 ");
63 print_flag();
64 return 0;
65 }
66
67
68 #endif
69
70 long n, k;
71
72 n = 200;
73 k = 10*NTL_ZZ_NBITS;
74
75 ZZ p;
76
77 RandomLen(p, k);
78
79
80 ZZ_p::init(p); // initialization
81
82 ZZ_pX f, g, h, r1, r2, r3;
83
84 random(g, n); // g = random polynomial of degree < n
85 random(h, n); // h = " "
86 random(f, n); // f = " "
87
88 SetCoeff(f, n); // Sets coefficient of X^n to 1
89
90 // For doing arithmetic mod f quickly, one must pre-compute
91 // some information.
92
93 ZZ_pXModulus F;
94 build(F, f);
95
96 PlainMul(r1, g, h); // this uses classical arithmetic
97 PlainRem(r1, r1, f);
98
99 MulMod(r2, g, h, F); // this uses the FFT
100
101 MulMod(r3, g, h, f); // uses FFT, but slower
102
103 // compare the results...
104
105 if (r1 != r2) {
106 printf("999999999999999 ");
107 print_flag();
108 return 0;
109 }
110 else if (r1 != r3) {
111 printf("999999999999999 ");
112 print_flag();
113 return 0;
114 }
115
116 double t;
117 long i;
118 long iter;
119
120 n = 1024;
121 k = 1024;
122 RandomLen(p, k);
123
124 ZZ_p::init(p);
125
126 ZZ_pX j1, j2, j3;
127
128 random(j1, n);
129 random(j2, n);
130
131 mul(j3, j1, j2);
132
133 iter = 1;
134
135 do {
136 t = GetTime();
137 for (i = 0; i < iter; i++) {
138 FFTMul(j3, j1, j2);
139 }
140 t = GetTime() - t;
141 iter = 2*iter;
142 } while(t < 1);
143
144 iter = iter/2;
145
146 iter = long((2/t)*iter) + 1;
147
148 double tvec[5];
149 long w;
150
151 for (w = 0; w < 5; w++) {
152 t = GetTime();
153 for (i = 0; i < iter; i++) {
154 FFTMul(j3, j1, j2);
155 }
156 t = GetTime() - t;
157 tvec[w] = t;
158 }
159
160
161 t = clean_data(tvec);
162
163 t = floor((t/iter)*1e12);
164
165 if (t < 0 || t >= 1e15)
166 printf("999999999999999 ");
167 else
168 printf("%015.0f ", t);
169
170 printf(" [%ld] ", iter);
171
172 print_flag();
173
174 return 0;
175 }
100100 }
101101
102102
103 ZZX SSMul(const ZZX& a, const ZZX& b)
103
104 ZZX KarMul(const ZZX& a, const ZZX& b)
104105 {
105106 ZZX res;
106 SSMul(res, a, b);
107 KarMul(res, a, b);
107108 return res;
108109 }
110
109111
110112
111113 int main()
120122 cerr << "NTL_ZZ_NBITS = " << NTL_ZZ_NBITS << "\n";
121123 cerr << "NTL_SP_NBITS = " << NTL_SP_NBITS << "\n";
122124
125 #ifdef NTL_HAVE_LL_TYPE
126 cerr << "NTL_HAVE_LL_TYPE\n";
127 #endif
128
129 #ifdef NTL_HAVE_BUILTIN_CLZL
130 cerr << "NTL_HAVE_BUILTIN_CLZL\n";
131 #endif
132
133 #ifdef NTL_HAVE_AVX
134 cerr << "NTL_HAVE_AVX\n";
135 #endif
136
137 #ifdef NTL_HAVE_FMA
138 cerr << "NTL_HAVE_FMA\n";
139 #endif
140
141
142
123143 #ifdef NTL_LONGDOUBLE_SP_MULMOD
124144 cerr << "NTL_LONGDOUBLE_SP_MULMOD\n";
125145 #endif
151171 cerr << "NTL_THREADS\n";
152172 #endif
153173
174 #ifdef NTL_DISABLE_TLS_HACK
175 cerr << "NTL_DISABLE_TLS_HACK\n";
176 #endif
177
178 #ifdef NTL_ENABLE_TLS_HACK
179 cerr << "NTL_ENABLE_TLS_HACK\n";
180 #endif
181
154182
155183 #ifdef NTL_EXCEPTIONS
156184 cerr << "NTL_EXCEPTIONS\n";
157185 #endif
158186
187 #ifdef NTL_THREAD_BOOST
188 cerr << "NTL_THREAD_BOOST\n";
189 #endif
190
159191
160192 #ifdef NTL_LEGACY_SP_MULMOD
161 cout << "NTL_LEGACY_SP_MULMOD\n";
193 cerr << "NTL_LEGACY_SP_MULMOD\n";
162194 #endif
163195
164196
165197 #ifdef NTL_DISABLE_LONGDOUBLE
166 cout << "NTL_DISABLE_LONGDOUBLE\n";
198 cerr << "NTL_DISABLE_LONGDOUBLE\n";
167199 #endif
168200
169201
170202 #ifdef NTL_DISABLE_LONGLONG
171 cout << "NTL_DISABLE_LONGLONG\n";
203 cerr << "NTL_DISABLE_LONGLONG\n";
204 #endif
205
206 #ifdef NTL_DISABLE_LL_ASM
207 cerr << "NTL_DISABLE_LL_ASM\n";
208 #endif
209
210 #ifdef NTL_MAXIMIZE_SP_NBITS
211 cerr << "NTL_MAXIMIZE_SP_NBITS\n";
172212 #endif
173213
174214
183223 cerr << "NTL_GF2X_LIB\n";
184224 #endif
185225
186
187 #ifdef NTL_PCLMUL
188 cerr << "NTL_PCLMUL\n";
189 #endif
190226
191227
192228 #ifdef NTL_LONG_LONG_TYPE
261297
262298
263299 #ifdef NTL_FFT_BIGTAB
264 cout << "NTL_FFT_BIGTAB\n";
300 cerr << "NTL_FFT_BIGTAB\n";
265301 #endif
266302
267303 #ifdef NTL_FFT_LAZYMUL
268 cout << "NTL_FFT_LAZYMUL\n";
269 #endif
270
271
272
304 cerr << "NTL_FFT_LAZYMUL\n";
305 #endif
273306
274307
275308 #ifdef NTL_TBL_REM
281314 cerr << "NTL_TBL_REM_LL\n";
282315 #endif
283316
317 #ifdef NTL_CRT_ALTCODE
318 cerr << "NTL_CRT_ALTCODE\n";
319 #endif
320
321 #ifdef NTL_CRT_ALTCODE_SMALL
322 cerr << "NTL_CRT_ALTCODE_SMALL\n";
323 #endif
284324
285325 #ifdef NTL_GF2X_ALTCODE
286326 cerr << "NTL_GF2X_ALTCODE\n";
290330 cerr << "NTL_GF2X_ALTCODE1\n";
291331 #endif
292332
293
294333 #ifdef NTL_GF2X_NOINLINE
295334 cerr << "NTL_GF2X_NOINLINE\n";
296335 #endif
336
337 #ifdef NTL_PCLMUL
338 cerr << "NTL_PCLMUL\n";
339 #endif
340
297341
298342 cerr << "\n\n";
299343
312356 cerr << ".";
313357 RandomLen(p, k);
314358 ZZ_p::init(p);
359
315360
316361 ZZ_pX a, b, c, c1;
317362
320365 random(b, n);
321366
322367 FFTMul(c, a, b);
323
324 c1 = conv<ZZ_pX>( SSMul( conv<ZZX>(a), conv<ZZX>(b) ) );
368 //cerr << ZZ_pInfo->FFTInfo->NumPrimes;
369
370 c1 = conv<ZZ_pX>( KarMul( conv<ZZX>(a), conv<ZZX>(b) ) );
325371
326372 if (c1 != c) {
327373 cerr << "ZZ_pX mul failed!\n";
408454 RandomLen(p, k);
409455
410456 ZZ_p::init(p);
457 if (!IsOdd(p)) p++;
411458
412459 ZZ_pX j1, j2, j3;
413460
417464 mul(j3, j1, j2);
418465
419466 t = GetTime();
420 for (i = 0; i < 100; i++) mul(j3, j1, j2);
467 for (i = 0; i < 500; i++) mul(j3, j1, j2);
421468 t = GetTime()-t;
422469
423470 cerr << "time to multiply degree 1023 polynomials\n modulo a 1024-bit number: ";
424 cerr << (t/100) << "s";
471 cerr << (t/500) << "s";
425472 cerr << "\n";
426473
427474 GF2X_time();
1313 // priority right now.
1414
1515
16 NTL_THREAD_LOCAL
16 NTL_CHEAP_THREAD_LOCAL
1717 long RR::prec = 150;
1818
1919 void RR::SetPrecision(long p)
2727 prec = p;
2828 }
2929
30 NTL_THREAD_LOCAL
30 NTL_CHEAP_THREAD_LOCAL
3131 long RR::oprec = 10;
3232
3333 void RR::SetOutputPrecision(long p)
107107
108108 void random(RR& z)
109109 {
110 NTL_THREAD_LOCAL static RR t;
110 NTL_TLS_LOCAL(RR, t);
111111 RandomBits(t.x, RR::prec);
112112 t.e = -RR::prec;
113113 normalize(z, t);
176176
177177 void add(RR& z, const RR& a, const RR& b)
178178 {
179 NTL_THREAD_LOCAL static RR t;
179 NTL_TLS_LOCAL(RR, t);
180180
181181 if (IsZero(a.x)) {
182182 xcopy(z, b);
229229
230230 void sub(RR& z, const RR& a, const RR& b)
231231 {
232 NTL_THREAD_LOCAL static RR t;
232 NTL_TLS_LOCAL(RR, t);
233233
234234 if (IsZero(a.x)) {
235235 negate(z, b);
321321
322322 void mul(RR& z, const RR& a, const RR& b)
323323 {
324 NTL_THREAD_LOCAL static RR t;
324 NTL_TLS_LOCAL(RR, t);
325325
326326 mul(t.x, a.x, b.x);
327327 t.e = a.e + b.e;
343343
344344 void sqr(RR& z, const RR& a)
345345 {
346 NTL_THREAD_LOCAL static RR t;
346 NTL_TLS_LOCAL(RR, t);
347347
348348 sqr(t.x, a.x);
349349 t.e = a.e + a.e;
382382 long k = RR::prec - la + lb + 1;
383383 if (k < 0) k = 0;
384384
385 NTL_THREAD_LOCAL static RR t;
385 NTL_TLS_LOCAL(RR, t);
386386 NTL_ZZRegister(A);
387387 NTL_ZZRegister(B);
388388 NTL_ZZRegister(R);
466466
467467 long compare(const RR& a, const RR& b)
468468 {
469 NTL_THREAD_LOCAL static RR t;
469 NTL_TLS_LOCAL(RR, t);
470470
471471 SubPrec(t, a, b, 1);
472472 return sign(t);
482482
483483 void trunc(RR& z, const RR& a)
484484 {
485 NTL_THREAD_LOCAL static RR t;
485 NTL_TLS_LOCAL(RR, t);
486486
487487 if (a.e >= 0)
488488 xcopy(z, a);
507507
508508 void floor(RR& z, const RR& a)
509509 {
510 NTL_THREAD_LOCAL static RR t;
510 NTL_TLS_LOCAL(RR, t);
511511
512512 if (a.e >= 0)
513513 xcopy(z, a);
534534
535535 void ceil(RR& z, const RR& a)
536536 {
537 NTL_THREAD_LOCAL static RR t;
537 NTL_TLS_LOCAL(RR, t);
538538
539539 if (a.e >= 0)
540540 xcopy(z, a);
582582 return;
583583 }
584584
585 NTL_THREAD_LOCAL static RR t;
585 NTL_TLS_LOCAL(RR, t);
586586 ConvPrec(t, a, len+a.e);
587587 xcopy(z, t);
588588 }
696696
697697 int e;
698698 double f;
699 NTL_THREAD_LOCAL static RR t;
699 NTL_TLS_LOCAL(RR, t);
700700
701701 f = frexp(a, &e);
702702
778778 return;
779779 }
780780
781 NTL_THREAD_LOCAL static RR t;
781 NTL_TLS_LOCAL(RR, t);
782782
783783 ConvPrec(t, a, len+a.e);
784784
800800 void conv(double& z, const RR& aa)
801801 {
802802 double x;
803 NTL_THREAD_LOCAL static RR a;
803 NTL_TLS_LOCAL(RR, a);
804804
805805 ConvPrec(a, aa, NTL_DOUBLE_PRECISION);
806806 // round to NTL_DOUBLE_PRECISION bits to avoid double overflow
814814
815815 void add(RR& z, const RR& a, double b)
816816 {
817 NTL_THREAD_LOCAL static RR B;
817 NTL_TLS_LOCAL(RR, B);
818818 B = b;
819819 add(z, a, B);
820820 }
823823
824824 void sub(RR& z, const RR& a, double b)
825825 {
826 NTL_THREAD_LOCAL static RR B;
826 NTL_TLS_LOCAL(RR, B);
827827 B = b;
828828 sub(z, a, B);
829829 }
830830
831831 void sub(RR& z, double a, const RR& b)
832832 {
833 NTL_THREAD_LOCAL static RR A;
833 NTL_TLS_LOCAL(RR, A);
834834 A = a;
835835 sub(z, A, b);
836836 }
839839
840840 void mul(RR& z, const RR& a, double b)
841841 {
842 NTL_THREAD_LOCAL static RR B;
842 NTL_TLS_LOCAL(RR, B);
843843 B = b;
844844 mul(z, a, B);
845845 }
847847
848848 void div(RR& z, const RR& a, double b)
849849 {
850 NTL_THREAD_LOCAL static RR B;
850 NTL_TLS_LOCAL(RR, B);
851851 B = b;
852852 div(z, a, B);
853853 }
854854
855855 void div(RR& z, double a, const RR& b)
856856 {
857 NTL_THREAD_LOCAL static RR A;
857 NTL_TLS_LOCAL(RR, A);
858858 A = a;
859859 div(z, A, b);
860860 }
862862
863863 void inv(RR& z, const RR& a)
864864 {
865 NTL_THREAD_LOCAL static RR one = to_RR(1);
865 NTL_TLS_LOCAL_INIT(RR, one, (to_RR(1)));
866866 div(z, one, a);
867867 }
868868
883883 {
884884 if (b == 0) return sign(a);
885885
886 NTL_THREAD_LOCAL static RR B;
886 NTL_TLS_LOCAL(RR, B);
887887 B = b;
888888 return compare(a, B);
889889 }
894894 if (b == 0) return IsZero(a);
895895 if (b == 1) return IsOne(a);
896896
897 NTL_THREAD_LOCAL static RR B;
897 NTL_TLS_LOCAL(RR, B);
898898 B = b;
899899 return a == B;
900900 }
11481148
11491149 void conv(RR& z, const quad_float& a)
11501150 {
1151 NTL_THREAD_LOCAL static RR hi, lo, res;
1151 NTL_TLS_LOCAL(RR, hi);
1152 NTL_TLS_LOCAL(RR, lo);
1153 NTL_TLS_LOCAL(RR, res);
11521154
11531155 ConvPrec(hi, a.hi, NTL_DOUBLE_PRECISION);
11541156 ConvPrec(lo, a.lo, NTL_DOUBLE_PRECISION);
11731175
11741176 void conv(quad_float& z, const RR& a)
11751177 {
1176 NTL_THREAD_LOCAL static RR a_hi, a_lo;
1178 NTL_TLS_LOCAL(RR, a_hi);
1179 NTL_TLS_LOCAL(RR, a_lo);
11771180
11781181 ConvPrec(a_hi, a, NTL_DOUBLE_PRECISION); // high order bits
11791182 SubPrec(a_lo, a, a_hi, NTL_DOUBLE_PRECISION); // low order bits
13651368
13661369 void ComputeE(RR& res)
13671370 {
1368 NTL_THREAD_LOCAL static long prec = 0;
1369 NTL_THREAD_LOCAL static RR e;
1371 static NTL_CHEAP_THREAD_LOCAL long prec = 0;
1372
1373 NTL_TLS_LOCAL(RR, e);
13701374
13711375 RRPush push;
13721376 long p = RR::precision();
14691473
14701474 void ComputeLn2(RR& res)
14711475 {
1472 NTL_THREAD_LOCAL static long prec = 0;
1473 NTL_THREAD_LOCAL static RR ln2;
1476 static NTL_CHEAP_THREAD_LOCAL long prec = 0;
1477
1478 NTL_TLS_LOCAL(RR, ln2);
14741479
14751480 RRPush push;
14761481 long p = RR::precision();
15541559
15551560 void ComputeLn10(RR& res)
15561561 {
1557 NTL_THREAD_LOCAL static long prec = 0;
1558 NTL_THREAD_LOCAL static RR ln10;
1562 static NTL_CHEAP_THREAD_LOCAL long prec = 0;
1563
1564 NTL_TLS_LOCAL(RR, ln10);
15591565
15601566 RRPush push;
15611567 long p = RR::precision();
17901796
17911797 void ComputePi(RR& res)
17921798 {
1793 NTL_THREAD_LOCAL static long prec = 0;
1794 NTL_THREAD_LOCAL static RR pi;
1799 static NTL_CHEAP_THREAD_LOCAL long prec = 0;
1800
1801 NTL_TLS_LOCAL(RR, pi);
17951802
17961803 RRPush push;
17971804 long p = RR::precision();
0
# Overwrite each feature-detection header under $1/include/NTL with a
# single empty line.  $1 is the directory that contains include/NTL.
for feature in LL_TYPE BUILTIN_CLZL AVX FMA
do
   echo "" > "$1/include/NTL/HAVE_$feature.h"
done
7979 else
8080 echo "bad MatrixTest"
8181 fi
82
83 echo
84 echo "---------------------------------"
85 echo "making mat_lzz_pTest"
86 make mat_lzz_pTest
87 echo "running mat_lzz_pTest"
88 ./mat_lzz_pTest
89 sh RemoveProg mat_lzz_pTest
90
8291
8392 echo
8493 echo "---------------------------------"
22 #ifdef NTL_THREADS
33
44
5 #include <NTL/ZZX.h>
56 #include <NTL/ZZ_pXFactoring.h>
6 #include <NTL/thread.h>
7
8 #include <thread>
9
10
7 #include <NTL/BasicThreadPool.h>
118 #include <cstdio>
129
1310 NTL_CLIENT
1411
12 #if 1
1513
1614
17 void task(ZZ_pContext context, ZZ_pX *f, vec_pair_ZZ_pX_long *v)
15 long mobius(long n)
1816 {
19 fprintf(stderr, "starting %s\n", CurrentThreadID().c_str());
20 context.restore();
21 CanZass(*v, *f);
22 fprintf(stderr, "stopping %s\n", CurrentThreadID().c_str());
17 long p,e,arity=0;
18 PrimeSeq s;
19 while (n!=1)
20 { p=s.next();
21 e=0;
22 while ((n%p==0)) { n=n/p; e++; }
23 if (e>1) { return 0; }
24 if (e!=0) { arity^=1; }
25 }
26 if (arity==0) { return 1; }
27 return -1;
2328 }
29
30
31 ZZX Cyclotomic(long N)
32 {
33 ZZX Num,Den,G,F;
34 set(Num); set(Den);
35 long m,d;
36 for (d=1; d<=N; d++)
37 { if ((N%d)==0)
38 { clear(G);
39 SetCoeff(G,N/d,1); SetCoeff(G,0,-1);
40 m=mobius(d);
41 if (m==1) { Num*=G; }
42 else if (m==-1) { Den*=G; }
43 }
44 }
45 F=Num/Den;
46 return F;
47 }
48
49 long multOrd(const ZZ& p, long m)
50 {
51 long pp = rem(p, m);
52 if (GCD(pp, m) != 1) return 0;
53
54 long ord = 1;
55 long val = pp;
56 while (val != 1) {
57 ord++;
58 val = MulMod(val, pp, m);
59 }
60 return ord;
61 }
62
63 #endif
64
65
66
2467
2568
2669 int main()
2770 {
71 SetSeed(ZZ(0));
72
2873 long NumContexts = 3;
2974 long NumPolys = 6;
30 long n = 500;
75 long n = 2000;
3176
3277 Vec<ZZ_pContext> context_vec;
3378 context_vec.SetLength(NumContexts);
3479
35 long i;
36 for (i = 0; i < NumContexts; i++) {
80 for (long i = 0; i < NumContexts; i++) {
3781 ZZ p;
38 RandomPrime(p, 150 + i*50);
82 GenPrime(p, 150 + i*20);
3983 context_vec[i] = ZZ_pContext(p);
4084 }
4185
4286 Vec<ZZ_pX> poly_vec;
4387 Vec<vec_pair_ZZ_pX_long> res_vec;
44 Vec< SmartPtr<thread> > thread_vec;
4588
4689 poly_vec.SetLength(NumPolys);
4790 res_vec.SetLength(NumPolys);
48 thread_vec.SetLength(NumPolys);
4991
50 for (i = 0; i < NumPolys; i++) {
51 ZZ_pPush push(context_vec[i % NumContexts]);
52 random(poly_vec[i], n);
53 SetCoeff(poly_vec[i], n);
92
93 for (long i = 0; i < NumPolys; i++) {
94 context_vec[i % NumContexts].restore();
95 ZZX f = Cyclotomic(n+i);
96 conv(poly_vec[i], f);
5497 }
98
5599
56100 cerr << "START\n";
57101
58 for (i = 0; i < NumPolys; i++)
59 thread_vec[i] = MakeSmart<thread>(task, context_vec[i % NumContexts],
60 &poly_vec[i], &res_vec[i]);
102 BasicThreadPool pool(NumPolys);
61103
62 for (i = 0; i < NumPolys; i++)
63 thread_vec[i]->join();
104 pool.exec_index(NumPolys,
105 [&](long i) {
106 fprintf(stderr, "starting %ld: %s\n", i, CurrentThreadID().c_str());
107 context_vec[i % NumContexts].restore();
108 CanZass(res_vec[i], poly_vec[i]);
109 fprintf(stderr, "stopping %ld: %s\n", i, CurrentThreadID().c_str());
110 });
64111
65112 cerr << "checking results...\n";
66113
67114
68 for (i = 0; i < NumPolys; i++) {
69 ZZ_pPush push(context_vec[i % NumContexts]);
70 vec_pair_ZZ_pX_long v;
71 berlekamp(v, poly_vec[i]);
72 if (v.length() == res_vec[i].length() && mul(v) == mul(res_vec[i]))
115 for (long i = 0; i < NumPolys; i++) {
116 context_vec[i % NumContexts].restore();
117 if (res_vec[i].length() == deg(poly_vec[i])/multOrd(ZZ_p::modulus(), n+i))
73118 cerr << i << " GOOD\n";
74119 else
75120 cerr << i << " BAD\n";
0 16:0:0
0 26:0:0
0 WinNTL-9_3_0
0 WinNTL-9_9_0
2727 mkdir small/include/NTL
2828
2929 cp MulTimeTest.c small/src
30 cp PolyTimeTest.c small/src
3130 cp Poly1TimeTest.c small/src
31 cp Poly2TimeTest.c small/src
32 cp Poly3TimeTest.c small/src
3233 cp GF2XTimeTest.c small/src
3334 cp InitSettings.c small/src
3435 cp DispSettings.c small/src
5455 cp GF2X.c small/src
5556 cp GF2X1.c small/src
5657 cp thread.c small/src
58 cp BasicThreadPool.c small/src
5759 cp fileio.c small/src
5860
5961
6062
63 sh CopyFeatures '..' small
6164 cp ../include/NTL/FFT.h small/include/NTL
6265 cp ../include/NTL/SPMM_ASM.h small/include/NTL
6366 cp ../include/NTL/ctools.h small/include/NTL
64 cp ../include/NTL/have_LL.h small/include/NTL
65 cp ../include/NTL/have_builtin_clzl.h small/include/NTL
6667 cp ../include/NTL/ZZ.h small/include/NTL
6768 cp ../include/NTL/sp_arith.h small/include/NTL
6869 cp ../include/NTL/ZZVec.h small/include/NTL
7980 cp ../include/NTL/Lazy.h small/include/NTL
8081 cp ../include/NTL/LazyTable.h small/include/NTL
8182 cp ../include/NTL/thread.h small/include/NTL
83 cp ../include/NTL/BasicThreadPool.h small/include/NTL
8284 cp ../include/NTL/fileio.h small/include/NTL
8385 cp ../include/NTL/tools.h small/include/NTL
8486 cp ../include/NTL/vec_ZZ.h small/include/NTL
103105
104106 echo "*"
105107 echo "*"
106 echo "* Updating config.h"
108 echo "* Updating config.h and wizard_log.h"
107109 echo "*"
108110 echo "*"
109111
110112 cp small/include/NTL/config.h ../include/NTL/config.h
113 cp small/src/wizard_log.h ../include/NTL/wizard_log.h
111114
112115 rm -r small
113116
8484 'NTL_SPMM_ASM' => 0,
8585 'NTL_TBL_REM' => 0,
8686 'NTL_TBL_REM_LL' => 0,
87 'NTL_CRT_ALTCODE' => 0,
88 'NTL_CRT_ALTCODE_SMALL'=> 0,
8789 'NTL_AVOID_BRANCHING' => 0,
8890 'NTL_GF2X_ALTCODE' => 0,
8991 'NTL_GF2X_ALTCODE1' => 0,
9092 'NTL_GF2X_NOINLINE' => 0,
93 'NTL_PCLMUL' => 0,
9194 'NTL_FFT_BIGTAB' => 0,
9295 'NTL_FFT_LAZYMUL' => 0,
9396
182185 GenConfigHeader();
183186 $time1 = RunProg("Poly1TimeTest");
184187
185 if ($time1*1.0 > $time*1.05) {
188 if ($time1*1.0 > $time*1.04) {
186189 # stick with BIGTABs
187190 $Config{"NTL_FFT_BIGTAB"} = 1;
188191 }
195198 unlink("lip.o");
196199
197200
198 if ($Config{"NTL_PCLMUL"} == 0) {
199
200 # set the flags GF2X_NOINLINE and GF2X_ALTCODE...try all pairs
201 # bit don't bother with this if PCLMUL is enabled
202
203 $time = "999999999999999";
204 $aflag = "default";
205 $bflag = "default";
206
207 foreach $aflag1 ("default", "NTL_GF2X_NOINLINE") {
208 foreach $bflag1 ("default", "NTL_GF2X_ALTCODE", "NTL_GF2X_ALTCODE1") {
209
210 $Config{$aflag1} = 1;
211 $Config{$bflag1} = 1;
212 GenConfigHeader();
213 $time1 = RunProg("GF2XTimeTest");
214
215 if ($time1 < $time) {
216 $aflag = $aflag1;
217 $bflag = $bflag1;
218 $time = $time1;
219 }
220
221 $Config{$aflag1} = 0;
222 $Config{$bflag1} = 0;
223 unlink("GF2X.o");
224 }
225 }
226
201
202
203 # set flags NTL_GF2X_NOINLINE, NTL_GF2X_ALTCODE, NTL_GF2X_ALTCODE1
204
205 $time = "999999999999999";
206 $aflag = "default";
207 $bflag = "default";
208
209 foreach $aflag1 ("default", "NTL_GF2X_NOINLINE") {
210 foreach $bflag1 ("default", "NTL_GF2X_ALTCODE", "NTL_GF2X_ALTCODE1") {
211
212 $Config{$aflag1} = 1;
213 $Config{$bflag1} = 1;
214 GenConfigHeader();
215 $time1 = RunProg("GF2XTimeTest");
216
217 if ($time1 < $time) {
218 $aflag = $aflag1;
219 $bflag = $bflag1;
220 $time = $time1;
221 }
222
223 $Config{$aflag1} = 0;
224 $Config{$bflag1} = 0;
225 unlink("GF2X.o");
226 }
227 }
228
229
230
231 # now try NTL_PCLMUL instead
232 unlink("GF2X.o");
233 unlink("GF2X1.o");
234 $Config{"NTL_PCLMUL"} = 1;
235 GenConfigHeader();
236 $time1 = RunProg("GF2XTimeTest");
237 unlink("GF2X.o");
238 unlink("GF2X1.o");
239 if ($time1 >= $time) {
240 $Config{"NTL_PCLMUL"} = 0;
227241 $Config{$aflag} = 1;
228242 $Config{$bflag} = 1;
229
230
231 }
232
233
243 }
234244
235245 if ($Config{"NTL_GMP_LIP"} == 0) {
236246
265275 foreach $flag1 ("default", "NTL_TBL_REM", "NTL_TBL_REM_LL") {
266276 $Config{$flag1} = 1;
267277 GenConfigHeader();
268 $time1 = RunProg("PolyTimeTest");
278 $time1 = RunProg("Poly2TimeTest");
269279
270280 if ($time1 < $time) {
271281 $flag = $flag1;
288298 foreach $flag1 ("default", "NTL_TBL_REM") {
289299 $Config{$flag1} = 1;
290300 GenConfigHeader();
291 $time1 = RunProg("PolyTimeTest");
301 $time1 = RunProg("Poly2TimeTest");
292302
293303 if ($time1 < $time) {
294304 $flag = $flag1;
301311
302312 $Config{$flag} = 1;
303313
314
315 # set NTL_CRT_ALTCODE
316
317 $time = "999999999999999";
318 $flag = "default";
319
320 foreach $flag1 ("default", "NTL_CRT_ALTCODE") {
321 $Config{$flag1} = 1;
322 GenConfigHeader();
323 $time1 = RunProg("Poly3TimeTest");
324
325 if ($time1 < $time) {
326 $flag = $flag1;
327 $time = $time1;
328 }
329
330 $Config{$flag1} = 0;
331 unlink("lip.o");
332 }
333
334 $Config{$flag} = 1;
335
336 # set NTL_CRT_ALTCODE_SMALL, if NTL_CRT_ALTCODE
337 # not set but it did not perform too badly
338
339 if ($Config{"NTL_CRT_ALTCODE"} == 0) {
340 # time measures default and time1 measures ALTCODE
341 if (1.0*$time1 < 1.15*$time) {
342 $Config{"NTL_CRT_ALTCODE_SMALL"} = 1;
343 }
344 }
345
346
347
304348 }
305349
306350 $Config{'WIZARD_HACK'} = "";
310354
311355 system("make DispSettings");
312356 system("./DispSettings");
313
314
357 system("./DispSettings > wizard_log.h");
358
359
123123
124124 void CopySwap(WordVector& x, WordVector& y)
125125 {
126 NTL_THREAD_LOCAL static WordVector t;
126 NTL_TLS_LOCAL(WordVector, t);
127127 WordVectorWatcher watch_t(t);
128128
129129 long sz_x = x.length();
+674
-520
src/ZZ.c less more
33 #include <NTL/Lazy.h>
44 #include <NTL/fileio.h>
55
6 #include <cstring>
7
68
79
810 NTL_START_IMPL
1315
1416 const ZZ& ZZ::zero()
1517 {
16 NTL_THREAD_LOCAL static ZZ z;
18
19 static const ZZ z; // GLOBAL (relies on C++11 thread-safe init)
1720 return z;
1821 }
1922
2023
2124 const ZZ& ZZ_expo(long e)
2225 {
23 NTL_THREAD_LOCAL static ZZ expo_helper;
26 NTL_TLS_LOCAL(ZZ, expo_helper);
27
2428 conv(expo_helper, e);
2529 return expo_helper;
2630 }
5357
5458 // ****** input and output
5559
56 NTL_THREAD_LOCAL static long iodigits = 0;
57 NTL_THREAD_LOCAL static long ioradix = 0;
58
60
61 static NTL_CHEAP_THREAD_LOCAL long iodigits = 0;
62 static NTL_CHEAP_THREAD_LOCAL long ioradix = 0;
5963 // iodigits is the greatest integer such that 10^{iodigits} < NTL_WSP_BOUND
6064 // ioradix = 10^{iodigits}
6165
172176 static
173177 void PrintDigits(ostream& s, long d, long justify)
174178 {
175 NTL_THREAD_LOCAL static Vec<char> buf(INIT_SIZE, iodigits);
179 NTL_TLS_LOCAL_INIT(Vec<char>, buf, (INIT_SIZE, iodigits));
176180
177181 long i = 0;
178182
320324 t = v1;
321325 }
322326
// InvModStatus: tries to compute an inverse of a modulo n.
// Calls XGCD(d, s, t, a, n); presumably d = gcd(a, n) and s*a + t*n = d
// (XGCD's contract is not shown here -- confirm).
// Returns 1 and stores d in x when d != 1 (no inverse);
// returns 0 and stores s, shifted by +n when negative, in x otherwise.
327 long InvModStatus(long& x, long a, long n)
328 {
329 long d, s, t;
330
331 XGCD(d, s, t, a, n);
332 if (d != 1) {
333 x = d;
334 return 1;
335 }
336 else {
// normalize a negative coefficient by adding n
337 if (s < 0)
338 x = s + n;
339 else
340 x = s;
341
342 return 0;
343 }
344 }
323345
324346 long InvMod(long a, long n)
325347 {
13691391
13701392
13711393
1372 // RANDOM NUMBER GENERATION
1373
1374 // Idea for this PRNG. Iteratively hash seed using md5
1375 // to get 256 bytes to initialize arc4.
1376 // Then use arc4 to get a pseudo-random byte stream.
1377
1378 // I've taken care that the pseudo-random numbers generated by
1379 // the routines RandomBnd, RandomBits, and RandomLen
1380 // are completely platform independent.
1381
1382 // I make use of the md5 compression function,
1383 // which I've modified to work on 64-bit machines
1384
1385
1386 /*
1387 * BEGIN RSA's md5 stuff
1388 *
1389 */
1390
1391 /*
1392 **********************************************************************
1393 ** md5.c **
1394 ** RSA Data Security, Inc. MD5 Message Digest Algorithm **
1395 ** Created: 2/17/90 RLR **
1396 ** Revised: 1/91 SRD,AJ,BSK,JT Reference C Version **
1397 **********************************************************************
1398 */
1399
1400 /*
1401 **********************************************************************
1402 ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
1403 ** **
1404 ** License to copy and use this software is granted provided that **
1405 ** it is identified as the "RSA Data Security, Inc. MD5 Message **
1406 ** Digest Algorithm" in all material mentioning or referencing this **
1407 ** software or this function. **
1408 ** **
1409 ** License is also granted to make and use derivative works **
1410 ** provided that such works are identified as "derived from the RSA **
1411 ** Data Security, Inc. MD5 Message Digest Algorithm" in all **
1412 ** material mentioning or referencing the derived work. **
1413 ** **
1414 ** RSA Data Security, Inc. makes no representations concerning **
1415 ** either the merchantability of this software or the suitability **
1416 ** of this software for any particular purpose. It is provided "as **
1417 ** is" without express or implied warranty of any kind. **
1418 ** **
1419 ** These notices must be retained in any copies of any part of this **
1420 ** documentation and/or software. **
1421 **********************************************************************
1422 */
1423
1424
1425 #if (NTL_BITS_PER_LONG <= 32)
1426 #define TRUNC32(x) (x)
1394 // ======================= new PRG stuff ======================
1395
1396
1397
1398
1399 #if (NTL_BITS_PER_INT32 == 32)
1400 #define INT32MASK(x) (x)
14271401 #else
1428 #define TRUNC32(x) ((x) & ((1UL << 32)-1UL))
1402 #define INT32MASK(x) ((x) & _ntl_uint32(0xffffffff))
14291403 #endif
14301404
1431 /* F, G and H are basic MD5 functions: selection, majority, parity */
1432 #define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
1433 #define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
1434 #define H(x, y, z) ((x) ^ (y) ^ (z))
1435 #define I(x, y, z) (TRUNC32((y) ^ ((x) | (~z))))
1436
1437 /* ROTATE_LEFT rotates x left n bits */
1438 #define ROTATE_LEFT(x, n) (TRUNC32(((x) << (n)) | ((x) >> (32-(n)))))
1439
1440 /* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
1441 /* Rotation is separate from addition to prevent recomputation */
1442 #define FF(a, b, c, d, x, s, ac) \
1443 {(a) = TRUNC32((a) + F((b), (c), (d)) + (x) + (ac)); \
1444 (a) = ROTATE_LEFT((a), (s)); \
1445 (a) = TRUNC32((a) + (b)); \
1446 }
1447 #define GG(a, b, c, d, x, s, ac) \
1448 {(a) = TRUNC32((a) + G((b), (c), (d)) + (x) + (ac)); \
1449 (a) = ROTATE_LEFT((a), (s)); \
1450 (a) = TRUNC32((a) + (b)); \
1451 }
1452 #define HH(a, b, c, d, x, s, ac) \
1453 {(a) = TRUNC32((a) + H((b), (c), (d)) + (x) + (ac)); \
1454 (a) = ROTATE_LEFT((a), (s)); \
1455 (a) = TRUNC32((a) + (b)); \
1456 }
1457 #define II(a, b, c, d, x, s, ac) \
1458 {(a) = TRUNC32((a) + I((b), (c), (d)) + (x) + (ac)); \
1459 (a) = ROTATE_LEFT((a), (s)); \
1460 (a) = TRUNC32((a) + (b)); \
1461 }
1462
1405
1406
1407 // SHA256 code adapted from an implementation by Brad Conte.
1408 // The following is from his original source files.
1409 /*********************************************************************
1410 * Filename: sha256.c
1411 * Author: Brad Conte (brad AT bradconte.com)
1412 * Copyright:
1413 * Disclaimer: This code is presented "as is" without any guarantees.
1414 * Details: Implementation of the SHA-256 hashing algorithm.
1415 SHA-256 is one of the three algorithms in the SHA2
1416 specification. The others, SHA-384 and SHA-512, are not
1417 offered in this implementation.
1418 Algorithm specification can be found here:
1419 * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2withchangenotice.pdf
1420 This implementation uses little endian byte order.
1421 *********************************************************************/
1422
1423
1424
1425
1426 #define SHA256_BLOCKSIZE (64)
1427 #define SHA256_HASHSIZE (32)
1428
1429 // DBL_INT_ADD treats two unsigned ints a and b as one 64-bit integer and adds c to it
// Adds c to the two-word quantity (a = low word, b = high word).
1430 static inline
1431 void DBL_INT_ADD(_ntl_uint32& a, _ntl_uint32& b, _ntl_uint32 c)
1432 {
// work on the low 32 bits of a only; _ntl_uint32 may be wider than 32 bits
1433 _ntl_uint32 aa = INT32MASK(a);
// carry into b when aa + c would exceed 0xffffffff
1434 if (aa > INT32MASK(_ntl_uint32(0xffffffff) - c)) b++;
// a is stored unmasked; readers apply INT32MASK / byte truncation as needed
1435 a = aa + c;
1436 }
1437
1438 #define ROTLEFT(a,b) (((a) << (b)) | (INT32MASK(a) >> (32-(b))))
1439 #define ROTRIGHT(a,b) ((INT32MASK(a) >> (b)) | ((a) << (32-(b))))
1440
1441 #define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
1442 #define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
1443 #define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
1444 #define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
1445 #define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ (INT32MASK(x) >> 3))
1446 #define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ (INT32MASK(x) >> 10))
1447
1448 struct SHA256_CTX {
1449 unsigned char data[64];
1450 _ntl_uint32 datalen;
1451 _ntl_uint32 bitlen[2];
1452 _ntl_uint32 state[8];
1453 };
1454
1455 static const _ntl_uint32 sha256_const[64] = {
1456 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1457 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1458 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1459 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1460 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1461 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1462 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1463 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1464 };
14631465
14641466
14651467 static
1466 void MD5_default_IV(unsigned long *buf)
1467 {
1468 buf[0] = 0x67452301UL;
1469 buf[1] = 0xefcdab89UL;
1470 buf[2] = 0x98badcfeUL;
1471 buf[3] = 0x10325476UL;
1472 }
1473
1474
1475
1476 /* Basic MD5 step. Transform buf based on in.
1477 */
// Processes one 64-byte block `data`, updating the eight state words in ctx.
// ctx.data / ctx.datalen / ctx.bitlen are not touched here.
1468 void sha256_transform(SHA256_CTX& ctx, unsigned char *data)
1469 {
1470 _ntl_uint32 a,b,c,d,e,f,g,h,i,j,t1,t2,m[64];
1471
// m[0..15]: the block read as sixteen big-endian 32-bit words
// NOTE(review): data[j] is promoted to int before "<< 24"; for bytes >= 0x80
// this shifts into the sign bit of a 32-bit int -- consider casting to
// _ntl_uint32 first. Confirm against the supported compilers.
1472 for (i=0,j=0; i < 16; ++i, j += 4)
1473 m[i] = (data[j] << 24) | (data[j+1] << 16) | (data[j+2] << 8) | (data[j+3]);
// m[16..63]: expanded from earlier words via SIG0/SIG1
1474 for ( ; i < 64; ++i)
1475 m[i] = SIG1(m[i-2]) + m[i-7] + SIG0(m[i-15]) + m[i-16];
1476
// load the working variables from the current state
1477 a = ctx.state[0];
1478 b = ctx.state[1];
1479 c = ctx.state[2];
1480 d = ctx.state[3];
1481 e = ctx.state[4];
1482 f = ctx.state[5];
1483 g = ctx.state[6];
1484 h = ctx.state[7];
1485
// 64 rounds, each mixing in one constant from sha256_const and one word of m
1486 for (i = 0; i < 64; ++i) {
1487 t1 = h + EP1(e) + CH(e,f,g) + sha256_const[i] + m[i];
1488 t2 = EP0(a) + MAJ(a,b,c);
1489 h = g;
1490 g = f;
1491 f = e;
1492 e = d + t1;
1493 d = c;
1494 c = b;
1495 b = a;
1496 a = t1 + t2;
1497 }
1498
// fold the working variables back into the state
1499 ctx.state[0] += a;
1500 ctx.state[1] += b;
1501 ctx.state[2] += c;
1502 ctx.state[3] += d;
1503 ctx.state[4] += e;
1504 ctx.state[5] += f;
1505 ctx.state[6] += g;
1506 ctx.state[7] += h;
1507 }
14781508
14791509 static
1480 void MD5_compress(unsigned long *buf, unsigned long *in)
1481 {
1482 unsigned long a = buf[0], b = buf[1], c = buf[2], d = buf[3];
1483
1484 /* Round 1 */
1485 #define S11 7
1486 #define S12 12
1487 #define S13 17
1488 #define S14 22
1489 FF ( a, b, c, d, in[ 0], S11, 3614090360UL); /* 1 */
1490 FF ( d, a, b, c, in[ 1], S12, 3905402710UL); /* 2 */
1491 FF ( c, d, a, b, in[ 2], S13, 606105819UL); /* 3 */
1492 FF ( b, c, d, a, in[ 3], S14, 3250441966UL); /* 4 */
1493 FF ( a, b, c, d, in[ 4], S11, 4118548399UL); /* 5 */
1494 FF ( d, a, b, c, in[ 5], S12, 1200080426UL); /* 6 */
1495 FF ( c, d, a, b, in[ 6], S13, 2821735955UL); /* 7 */
1496 FF ( b, c, d, a, in[ 7], S14, 4249261313UL); /* 8 */
1497 FF ( a, b, c, d, in[ 8], S11, 1770035416UL); /* 9 */
1498 FF ( d, a, b, c, in[ 9], S12, 2336552879UL); /* 10 */
1499 FF ( c, d, a, b, in[10], S13, 4294925233UL); /* 11 */
1500 FF ( b, c, d, a, in[11], S14, 2304563134UL); /* 12 */
1501 FF ( a, b, c, d, in[12], S11, 1804603682UL); /* 13 */
1502 FF ( d, a, b, c, in[13], S12, 4254626195UL); /* 14 */
1503 FF ( c, d, a, b, in[14], S13, 2792965006UL); /* 15 */
1504 FF ( b, c, d, a, in[15], S14, 1236535329UL); /* 16 */
1505
1506 /* Round 2 */
1507 #define S21 5
1508 #define S22 9
1509 #define S23 14
1510 #define S24 20
1511 GG ( a, b, c, d, in[ 1], S21, 4129170786UL); /* 17 */
1512 GG ( d, a, b, c, in[ 6], S22, 3225465664UL); /* 18 */
1513 GG ( c, d, a, b, in[11], S23, 643717713UL); /* 19 */
1514 GG ( b, c, d, a, in[ 0], S24, 3921069994UL); /* 20 */
1515 GG ( a, b, c, d, in[ 5], S21, 3593408605UL); /* 21 */
1516 GG ( d, a, b, c, in[10], S22, 38016083UL); /* 22 */
1517 GG ( c, d, a, b, in[15], S23, 3634488961UL); /* 23 */
1518 GG ( b, c, d, a, in[ 4], S24, 3889429448UL); /* 24 */
1519 GG ( a, b, c, d, in[ 9], S21, 568446438UL); /* 25 */
1520 GG ( d, a, b, c, in[14], S22, 3275163606UL); /* 26 */
1521 GG ( c, d, a, b, in[ 3], S23, 4107603335UL); /* 27 */
1522 GG ( b, c, d, a, in[ 8], S24, 1163531501UL); /* 28 */
1523 GG ( a, b, c, d, in[13], S21, 2850285829UL); /* 29 */
1524 GG ( d, a, b, c, in[ 2], S22, 4243563512UL); /* 30 */
1525 GG ( c, d, a, b, in[ 7], S23, 1735328473UL); /* 31 */
1526 GG ( b, c, d, a, in[12], S24, 2368359562UL); /* 32 */
1527
1528 /* Round 3 */
1529 #define S31 4
1530 #define S32 11
1531 #define S33 16
1532 #define S34 23
1533 HH ( a, b, c, d, in[ 5], S31, 4294588738UL); /* 33 */
1534 HH ( d, a, b, c, in[ 8], S32, 2272392833UL); /* 34 */
1535 HH ( c, d, a, b, in[11], S33, 1839030562UL); /* 35 */
1536 HH ( b, c, d, a, in[14], S34, 4259657740UL); /* 36 */
1537 HH ( a, b, c, d, in[ 1], S31, 2763975236UL); /* 37 */
1538 HH ( d, a, b, c, in[ 4], S32, 1272893353UL); /* 38 */
1539 HH ( c, d, a, b, in[ 7], S33, 4139469664UL); /* 39 */
1540 HH ( b, c, d, a, in[10], S34, 3200236656UL); /* 40 */
1541 HH ( a, b, c, d, in[13], S31, 681279174UL); /* 41 */
1542 HH ( d, a, b, c, in[ 0], S32, 3936430074UL); /* 42 */
1543 HH ( c, d, a, b, in[ 3], S33, 3572445317UL); /* 43 */
1544 HH ( b, c, d, a, in[ 6], S34, 76029189UL); /* 44 */
1545 HH ( a, b, c, d, in[ 9], S31, 3654602809UL); /* 45 */
1546 HH ( d, a, b, c, in[12], S32, 3873151461UL); /* 46 */
1547 HH ( c, d, a, b, in[15], S33, 530742520UL); /* 47 */
1548 HH ( b, c, d, a, in[ 2], S34, 3299628645UL); /* 48 */
1549
1550 /* Round 4 */
1551 #define S41 6
1552 #define S42 10
1553 #define S43 15
1554 #define S44 21
1555 II ( a, b, c, d, in[ 0], S41, 4096336452UL); /* 49 */
1556 II ( d, a, b, c, in[ 7], S42, 1126891415UL); /* 50 */
1557 II ( c, d, a, b, in[14], S43, 2878612391UL); /* 51 */
1558 II ( b, c, d, a, in[ 5], S44, 4237533241UL); /* 52 */
1559 II ( a, b, c, d, in[12], S41, 1700485571UL); /* 53 */
1560 II ( d, a, b, c, in[ 3], S42, 2399980690UL); /* 54 */
1561 II ( c, d, a, b, in[10], S43, 4293915773UL); /* 55 */
1562 II ( b, c, d, a, in[ 1], S44, 2240044497UL); /* 56 */
1563 II ( a, b, c, d, in[ 8], S41, 1873313359UL); /* 57 */
1564 II ( d, a, b, c, in[15], S42, 4264355552UL); /* 58 */
1565 II ( c, d, a, b, in[ 6], S43, 2734768916UL); /* 59 */
1566 II ( b, c, d, a, in[13], S44, 1309151649UL); /* 60 */
1567 II ( a, b, c, d, in[ 4], S41, 4149444226UL); /* 61 */
1568 II ( d, a, b, c, in[11], S42, 3174756917UL); /* 62 */
1569 II ( c, d, a, b, in[ 2], S43, 718787259UL); /* 63 */
1570 II ( b, c, d, a, in[ 9], S44, 3951481745UL); /* 64 */
1571
1572 buf[0] = TRUNC32(buf[0] + a);
1573 buf[1] = TRUNC32(buf[1] + b);
1574 buf[2] = TRUNC32(buf[2] + c);
1575 buf[3] = TRUNC32(buf[3] + d);
1576 }
1577
1578
1579 /*
1580 * END RSA's md5 stuff
1581 *
1582 */
1583
// Resets ctx for a new hash: empties the block buffer count, zeroes the
// two-word bit length, and loads the eight initial state words.
1510 void sha256_init(SHA256_CTX& ctx)
1511 {
1512 ctx.datalen = 0;
1513 ctx.bitlen[0] = 0;
1514 ctx.bitlen[1] = 0;
1515 ctx.state[0] = 0x6a09e667;
1516 ctx.state[1] = 0xbb67ae85;
1517 ctx.state[2] = 0x3c6ef372;
1518 ctx.state[3] = 0xa54ff53a;
1519 ctx.state[4] = 0x510e527f;
1520 ctx.state[5] = 0x9b05688c;
1521 ctx.state[6] = 0x1f83d9ab;
1522 ctx.state[7] = 0x5be0cd19;
1523 }
15841524
15851525 static
1586 void words_from_bytes(unsigned long *txtl, const unsigned char *txtc, long n)
1587 {
// Feeds len bytes from data into ctx, one byte at a time.
// Each time the 64-byte buffer fills, it is transformed, 512 is added to the
// bit length, and the buffer count is reset.
1526 void sha256_update(SHA256_CTX& ctx, const unsigned char *data, _ntl_uint32 len)
1527 {
1528 _ntl_uint32 i;
1529
1530 for (i=0; i < len; ++i) {
1531 ctx.data[ctx.datalen] = data[i];
1532 ctx.datalen++;
1533 if (ctx.datalen == 64) {
1534 sha256_transform(ctx,ctx.data);
// bitlen counts only full blocks here; the tail is added in sha256_final
1535 DBL_INT_ADD(ctx.bitlen[0],ctx.bitlen[1],512);
1536 ctx.datalen = 0;
1537 }
1538 }
1539 }
// Finishes the hash in ctx and writes the first hlen bytes of the digest
// to hash (at most 32 bytes are ever written, whatever hlen is).
1541 static
1542 void sha256_final(SHA256_CTX& ctx, unsigned char *hash,
1543 long hlen=SHA256_HASHSIZE)
1544 {
1545 _ntl_uint32 i, j;
1546
1547 i = ctx.datalen;
1548
1549 // Pad whatever data is left in the buffer.
// fewer than 56 buffered bytes: the 0x80 marker and the length fit in this block
1550 if (ctx.datalen < 56) {
1551 ctx.data[i++] = 0x80;
1552 while (i < 56)
1553 ctx.data[i++] = 0x00;
1554 }
// otherwise pad out this block, transform it, and start a zeroed block
1555 else {
1556 ctx.data[i++] = 0x80;
1557 while (i < 64)
1558 ctx.data[i++] = 0x00;
1559 sha256_transform(ctx,ctx.data);
1560 memset(ctx.data,0,56);
1561 }
1562
1563 // Append to the padding the total message's length in bits and transform.
1564 DBL_INT_ADD(ctx.bitlen[0],ctx.bitlen[1],ctx.datalen * 8);
1565
// bytes 56..63 hold the bit length, most significant byte first
// (bitlen[1] is the high word, bitlen[0] the low word)
1566 ctx.data[63] = ctx.bitlen[0];
1567 ctx.data[62] = ctx.bitlen[0] >> 8;
1568 ctx.data[61] = ctx.bitlen[0] >> 16;
1569 ctx.data[60] = ctx.bitlen[0] >> 24;
1570 ctx.data[59] = ctx.bitlen[1];
1571 ctx.data[58] = ctx.bitlen[1] >> 8;
1572 ctx.data[57] = ctx.bitlen[1] >> 16;
1573 ctx.data[56] = ctx.bitlen[1] >> 24;
1574 sha256_transform(ctx,ctx.data);
1575
// emit the state words most significant byte first, stopping after hlen bytes
1576 for (i = 0; i < 8; i++) {
1577 _ntl_uint32 w = ctx.state[i];
1578 for (j = 0; j < 4; j++) {
// break leaves only the inner loop; later outer iterations break at once
1579 if (hlen <= 0) break;
1580 hash[4*i + j] = w >> (24-j*8);
1581 hlen--;
1582 }
1583 }
1584
1585 }
1586
1587
1588
1589 static
1590 void sha256(const unsigned char *data, long dlen, unsigned char *hash,
1591 long hlen=SHA256_HASHSIZE)
1592 {
1593 if (dlen < 0) dlen = 0;
1594 if (hlen < 0) hlen = 0;
1595
1596 SHA256_CTX ctx;
1597 sha256_init(ctx);
1598
1599 const long BLKSIZE = 4096;
1600
15881601 long i;
1589 unsigned long v;
1590
1591 for (i = 0; i < n; i++) {
1592 v = txtc[4*i];
1593 v += ((unsigned long) (txtc[4*i+1])) << 8;
1594 v += ((unsigned long) (txtc[4*i+2])) << 16;
1595 v += ((unsigned long) (txtc[4*i+3])) << 24;
1596 txtl[i] = v;
1597 }
1598 }
1599
1600 static
1601 void bytes_from_words(unsigned char *txtc, const unsigned long *txtl, long n)
1602 {
1602 for (i = 0; i <= dlen-BLKSIZE; i += BLKSIZE)
1603 sha256_update(ctx, data + i, BLKSIZE);
1604
1605 if (i < dlen)
1606 sha256_update(ctx, data + i, dlen - i);
1607
1608 sha256_final(ctx, hash, hlen);
1609 }
1610
1611
1612 static
1613 void hmac_sha256(const unsigned char *key, long klen,
1614 const unsigned char *data, long dlen,
1615 unsigned char *hash, long hlen=SHA256_HASHSIZE)
1616 {
1617 if (klen < 0) klen = 0;
1618 if (dlen < 0) dlen = 0;
1619 if (hlen < 0) hlen = 0;
1620
1621 unsigned char K[SHA256_BLOCKSIZE];
1622 unsigned char tmp[SHA256_HASHSIZE];
1623
16031624 long i;
1604 unsigned long v;
1605
1606 for (i = 0; i < n; i++) {
1607 v = txtl[i];
1608 txtc[4*i] = v & 255;
1609 v = v >> 8;
1610 txtc[4*i+1] = v & 255;
1611 v = v >> 8;
1612 txtc[4*i+2] = v & 255;
1613 v = v >> 8;
1614 txtc[4*i+3] = v & 255;
1615 }
1616 }
1625
1626 if (klen <= SHA256_BLOCKSIZE) {
1627 for (i = 0; i < klen; i++)
1628 K[i] = key[i];
1629 for (i = klen; i < SHA256_BLOCKSIZE; i++)
1630 K[i] = 0;
1631 }
1632 else {
1633 sha256(key, klen, K, SHA256_BLOCKSIZE);
1634 for (i = SHA256_HASHSIZE; i < SHA256_BLOCKSIZE; i++)
1635 K[i] = 0;
1636 }
1637
1638 for (i = 0; i < SHA256_BLOCKSIZE; i++)
1639 K[i] ^= 0x36;
1640
1641 SHA256_CTX ctx;
1642 sha256_init(ctx);
1643 sha256_update(ctx, K, SHA256_BLOCKSIZE);
1644 sha256_update(ctx, data, dlen);
1645 sha256_final(ctx, tmp);
1646
1647 for (i = 0; i < SHA256_BLOCKSIZE; i++)
1648 K[i] ^= (0x36 ^ 0x5C);
1649
1650 sha256_init(ctx);
1651 sha256_update(ctx, K, SHA256_BLOCKSIZE);
1652 sha256_update(ctx, tmp, SHA256_HASHSIZE);
1653 sha256_final(ctx, hash, hlen);
1654 }
1655
1656
1657 // This key derivation uses HMAC with a zero key to derive
1658 // an intermediate key K from the data, and then uses HMAC
1659 // as a PRF in counter mode with key K to derive the final key
1660
// Fills key[0..klen-1] with bytes derived from data[0..dlen-1].
// Calls LogicError if dlen or klen is negative.
1661 void DeriveKey(unsigned char *key, long klen,
1662 const unsigned char *data, long dlen)
1663 {
1664 if (dlen < 0) LogicError("DeriveKey: bad args");
1665 if (klen < 0) LogicError("DeriveKey: bad args");
1666
1667 long i, j;
1668
1669
// intermediate key K = HMAC of the data under an empty (zero-length) key
1670 unsigned char K[SHA256_HASHSIZE];
1671 hmac_sha256(0, 0, data, dlen, K);
1672
1673 // initialize 64-bit counter to zero
1674 unsigned char counter[8];
1675 for (j = 0; j < 8; j++) counter[j] = 0;
1676
// one full SHA256_HASHSIZE-byte output block per counter value
1677 for (i = 0; i <= klen-SHA256_HASHSIZE; i += SHA256_HASHSIZE) {
1678 hmac_sha256(K, SHA256_HASHSIZE, counter, 8, key+i);
1679
1680 // increment counter
// byte 0 is least significant; carry moves up only on wrap to 0
1681 for (j = 0; j < 8; j++) {
1682 counter[j]++;
1683 if (counter[j] != 0) break;
1684 }
1685 }
1686
// final partial block: request only the remaining klen-i bytes
1687 if (i < klen)
1688 hmac_sha256(K, SHA256_HASHSIZE, counter, 8, key+i, klen-i);
1689 }
1690
1691
1692
1693
1694 // ******************** ChaCha20 stuff ***********************
1695
1696 static const _ntl_uint32 chacha_const[4] =
1697 { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
1698
1699
1700 #define LE(p) (((_ntl_uint32)((p)[0])) + ((_ntl_uint32)((p)[1]) << 8) + \
1701 ((_ntl_uint32)((p)[2]) << 16) + ((_ntl_uint32)((p)[3]) << 24))
1702
1703 #define FROMLE(p, x) (p)[0] = (x), (p)[1] = ((x) >> 8), \
1704 (p)[2] = ((x) >> 16), (p)[3] = ((x) >> 24)
1705
1706
1707 #define QUARTERROUND(x, a, b, c, d) \
1708 x[a] += x[b], x[d] = ROTLEFT(x[d] ^ x[a], 16), \
1709 x[c] += x[d], x[b] = ROTLEFT(x[b] ^ x[c], 12), \
1710 x[a] += x[b], x[d] = ROTLEFT(x[d] ^ x[a], 8), \
1711 x[c] += x[d], x[b] = ROTLEFT(x[b] ^ x[c], 7)
16171712
16181713
16191714 static
1620 void MD5_compress1(unsigned long *buf, unsigned char *in, long n)
1621 {
1622 unsigned long txtl[16];
1623 unsigned char txtc[64];
1624 long i, j, k;
1625
1626 if (n < 0) n = 0;
1627
1628 i = 0;
1629 while (i < n) {
1630 k = n-i;
1631 if (k > 64) k = 64;
1632 for (j = 0; j < k; j++)
1633 txtc[j] = in[i+j];
1634 for (; j < 64; j++)
1635 txtc[j] = 0;
1636 words_from_bytes(txtl, txtc, 16);
1637 MD5_compress(buf, txtl);
1638 i += k;
1639 }
1640 }
1641
1642
1643 // the "cipherpunk" version of arc4
1644
1645 struct _ZZ_arc4_key
1646 {
1647 unsigned char state[256];
1648 unsigned char x;
1649 unsigned char y;
1650 };
1651
// Applies 10 iterations of eight QUARTERROUNDs in place to the 16 words in data.
// NOTE(review): the name says salsa20, but the enclosing section is headed
// "ChaCha20 stuff" and uses chacha_const -- confirm which naming is intended.
1715 void salsa20_core(_ntl_uint32* data)
1716 {
1717 long i;
1718
1719 for (i = 0; i < 10; i++) {
// index sets (0,4,8,12) ... (3,7,11,15): columns of the 4x4 word layout
1720 QUARTERROUND(data, 0, 4, 8, 12);
1721 QUARTERROUND(data, 1, 5, 9, 13);
1722 QUARTERROUND(data, 2, 6, 10, 14);
1723 QUARTERROUND(data, 3, 7, 11, 15);
// index sets (0,5,10,15) ... (3,4,9,14): diagonals of the 4x4 word layout
1724 QUARTERROUND(data, 0, 5, 10, 15);
1725 QUARTERROUND(data, 1, 6, 11, 12);
1726 QUARTERROUND(data, 2, 7, 8, 13);
1727 QUARTERROUND(data, 3, 4, 9, 14);
1728 }
1729 }
1730
1731
1732 // key K must be exactly 32 bytes
// Builds the 16-word state from the 32-byte key K.
1733 static
1734 void salsa20_init(_ntl_uint32 *state, const unsigned char *K)
1735 {
1736 long i;
1737
// words 0..3: fixed constants
1738 for (i = 0; i < 4; i++)
1739 state[i] = chacha_const[i];
1740
// words 4..11: the key, read as eight little-endian 32-bit words
1741 for (i = 4; i < 12; i++)
1742 state[i] = LE(K + 4*(i-4));
1743
// words 12..15: zeroed; salsa20_apply increments them as a counter
1744 for (i = 12; i < 16; i++)
1745 state[i] = 0;
1746 }
1747
1748
1749
1750 // state and data are of length 16
// Produces one 16-word output block in data from state, then advances
// the counter held in state[12..15].
1751 static
1752 void salsa20_apply(_ntl_uint32 *state, _ntl_uint32 *data)
1753 {
1754 long i;
1755
1756 for (i = 0; i < 16; i++) data[i] = state[i];
1757
1758 salsa20_core(data);
1759
// feed-forward: add the input state word-wise to the permuted copy
1760 for (i = 0; i < 16; i++) data[i] += state[i];
1761
// multi-word increment: mask to 32 bits, carry only when a word wraps to 0
1762 for (i = 12; i < 16; i++) {
1763 state[i]++;
1764 state[i] = INT32MASK(state[i]);
1765 if (state[i] != 0) break;
1766 }
1767 }
1768
1769
1770 #if 0
1771 // state is 16 words, data is 64 bytes
1772 static
1773 void salsa20_apply(_ntl_uint32 *state, unsigned char *data)
1774 {
1775 _ntl_uint32 wdata[16];
1776 salsa20_apply(state, wdata);
1777
1778 long i;
1779 for (i = 0; i < 16; i++)
1780 FROMLE(data + 4*i, wdata[i]);
1781
1782 // FIXME: could use memcpy for above if everything
1783 // is right
1784 }
1785 #endif
1786
1787
1788
// Constructs a stream from key; salsa20_init reads exactly 32 bytes of it.
1789 RandomStream::RandomStream(const unsigned char *key)
1790 {
1791 salsa20_init(state, key);
// pos == 64 marks the 64-byte buffer as fully consumed (nothing buffered yet)
1792 pos = 64;
1793 }
1794
1795
// Writes n stream bytes to res. Calls LogicError if n is negative.
// buf and pos are presumably members declared in the class header (not
// shown here); unread buffered bytes are buf[pos..63].
1796 void RandomStream::do_get(unsigned char *NTL_RESTRICT res, long n)
1797 {
1798 if (n < 0) LogicError("RandomStream::get: bad args");
1799
1800 long i, j;
1801
// fast path: the request is fully satisfied from the buffer
1802 if (n <= 64-pos) {
1803 for (i = 0; i < n; i++) res[i] = buf[pos+i];
1804 pos += n;
1805 return;
1806 }
1807
1808 // read remainder of buffer
1809 for (i = 0; i < 64-pos; i++) res[i] = buf[pos+i];
1810 n -= 64-pos;
1811 res += 64-pos;
1812 pos = 64;
1813
1814 _ntl_uint32 wdata[16];
1815
1816 // read 64-byte chunks
// whole blocks go straight to res as little-endian words, bypassing buf
1817 for (i = 0; i <= n-64; i += 64) {
1818 salsa20_apply(state, wdata);
1819 for (j = 0; j < 16; j++)
1820 FROMLE(res + i + 4*j, wdata[j]);
1821 }
1822
// tail: generate one more block into buf, hand out the first n-i bytes,
// and leave the rest buffered for later calls
1823 if (i < n) {
1824 salsa20_apply(state, wdata);
1825
1826 for (j = 0; j < 16; j++)
1827 FROMLE(buf + 4*j, wdata[j]);
1828
1829 pos = n-i;
1830 for (j = 0; j < pos; j++)
1831 res[i+j] = buf[j];
1832 }
1833 }
1834
1835
1836 NTL_TLS_GLOBAL_DECL(UniquePtr<RandomStream>, CurrentRandomStream);
1837
1838
// Installs a copy of s as the current random stream.
// NTL_TLS_GLOBAL_ACCESS presumably brings the thread-local
// CurrentRandomStream into scope -- the macro is defined elsewhere.
1839 void SetSeed(const RandomStream& s)
1840 {
1841 NTL_TLS_GLOBAL_ACCESS(CurrentRandomStream);
1842
// allocate on first use, otherwise overwrite the existing stream in place
1843 if (!CurrentRandomStream)
1844 CurrentRandomStream.make(s);
1845 else
1846 *CurrentRandomStream = s;
1847 }
1848
1849
// Seeds the current random stream from dlen bytes of data.
// Calls LogicError if dlen is negative.
1850 void SetSeed(const unsigned char *data, long dlen)
1851 {
1852 if (dlen < 0) LogicError("SetSeed: bad args");
1853
// derive an NTL_PRG_KEYLEN-byte key from the seed material
1854 Vec<unsigned char> key;
1855 key.SetLength(NTL_PRG_KEYLEN);
1856 DeriveKey(key.elts(), NTL_PRG_KEYLEN, data, dlen);
1857
1858 SetSeed(RandomStream(key.elts()));
1859 }
1860
// Seeds the current random stream from a ZZ: converts seed to NumBytes(seed)
// bytes with BytesFromZZ and forwards them to the byte-array SetSeed.
1861 void SetSeed(const ZZ& seed)
1862 {
1863 long nb = NumBytes(seed);
1864
1865 Vec<unsigned char> buf;
1866 buf.SetLength(nb);
1867
1868 BytesFromZZ(buf.elts(), seed, nb);
1869
1870 SetSeed(buf.elts(), nb);
1871 }
1872
1873
// Default seeding: uses the bytes of the string returned by UniqueID()
// as seed material.
1874 static
1875 void InitRandomStream()
1876 {
1877 const string& id = UniqueID();
// cast from char* to unsigned char* to match the byte-array SetSeed overload
1878 SetSeed((const unsigned char *) id.c_str(), id.length());
1879 }
16521880
16531881 static inline
1654 void swap_byte(unsigned char *a, unsigned char *b)
1655 {
1656 unsigned char swapByte;
1657
1658 swapByte = *a;
1659 *a = *b;
1660 *b = swapByte;
1661 }
1662
1663 static
1664 void prepare_key(unsigned char *key_data_ptr,
1665 long key_data_len, _ZZ_arc4_key *key)
1666 {
1667 unsigned char index1;
1668 unsigned char index2;
1669 unsigned char* state;
1670 long counter;
1671
1672 state = &key->state[0];
1673 for(counter = 0; counter < 256; counter++)
1674 state[counter] = counter;
1675 key->x = 0;
1676 key->y = 0;
1677 index1 = 0;
1678 index2 = 0;
1679 for(counter = 0; counter < 256; counter++)
1680 {
1681 index2 = (key_data_ptr[index1] + state[counter] + index2) & 255;
1682 swap_byte(&state[counter], &state[index2]);
1683
1684 index1 = (index1 + 1) % key_data_len;
1685 }
1686 }
1687
1688
1689
1690 static
1691 void arc4(unsigned char *buffer_ptr, long buffer_len, _ZZ_arc4_key *key)
1692 {
1693 unsigned char x;
1694 unsigned char y;
1695 unsigned char* state;
1696 unsigned char xorIndex;
1697 long counter;
1698
1699 x = key->x;
1700 y = key->y;
1701
1702 state = &key->state[0];
1703 for(counter = 0; counter < buffer_len; counter ++)
1704 {
1705 x = (x + 1) & 255;
1706 y = (state[x] + y) & 255;
1707 swap_byte(&state[x], &state[y]);
1708
1709 xorIndex = (state[x] + state[y]) & 255;
1710
1711 buffer_ptr[counter] = state[xorIndex];
1712 }
1713 key->x = x;
1714 key->y = y;
1715 }
1716
1717 // global state information for PRNG
1718
1719 NTL_THREAD_LOCAL static long ran_initialized = 0;
1720 NTL_THREAD_LOCAL static _ZZ_arc4_key ran_key;
1721
1722 static const unsigned long default_md5_tab[16] = {
1723 744663023UL, 1011602954UL, 3163087192UL, 3383838527UL,
1724 3305324122UL, 3197458079UL, 2266495600UL, 2760303563UL,
1725 346234297UL, 1919920720UL, 1896169861UL, 2192176675UL,
1726 2027150322UL, 2090160759UL, 2134858730UL, 1131796244UL
1727 };
1728
1729
1730
1731 static
1732 void build_arc4_tab(unsigned char *seed_bytes, const ZZ& s)
1733 {
1734 long nb = NumBytes(s);
1735
1736 unsigned char *txt;
1737
1738 Vec<unsigned char> txt_storage;
1739 txt_storage.SetLength(nb + 68);
1740 txt = txt_storage.elts();
1741
1742 BytesFromZZ(txt + 4, s, nb);
1743
1744 bytes_from_words(txt + nb + 4, default_md5_tab, 16);
1745
1746 unsigned long buf[4];
1747
1748 unsigned long i;
1749 for (i = 0; i < 16; i++) {
1750 MD5_default_IV(buf);
1751 bytes_from_words(txt, &i, 1);
1752
1753 MD5_compress1(buf, txt, nb + 68);
1754
1755 bytes_from_words(seed_bytes + 16*i, buf, 4);
1756 }
1757 }
1758
1759
1760 void SetSeed(const ZZ& s)
1761 {
1762 unsigned char seed_bytes[256];
1763
1764 build_arc4_tab(seed_bytes, s);
1765 prepare_key(seed_bytes, 256, &ran_key);
1766
1767 // we discard the first 1024 bytes of the arc4 stream, as this is
1768 // recommended practice.
1769
1770 arc4(seed_bytes, 256, &ran_key);
1771 arc4(seed_bytes, 256, &ran_key);
1772 arc4(seed_bytes, 256, &ran_key);
1773 arc4(seed_bytes, 256, &ran_key);
1774
1775 ran_initialized = 1;
1776 }
1777
1778
1779 static
1780 void ran_bytes(unsigned char *bytes, long n)
1781 {
1782 if (!ran_initialized) {
1783 ZZ x;
1784 const string& id = UniqueID();
1785
1786 ZZFromBytes(x, (const unsigned char *) id.c_str(), id.length());
1787 // DIRT: slightly dirty cast from char * to unsigned char *,
1788
1789 SetSeed(x);
1790 }
1791 arc4(bytes, n, &ran_key);
// Returns the current random stream, seeding it via InitRandomStream()
// first if none has been set yet.
1882 RandomStream& LocalGetCurrentRandomStream()
1883 {
1884 NTL_TLS_GLOBAL_ACCESS(CurrentRandomStream);
1885
1886 if (!CurrentRandomStream) InitRandomStream();
1887 return *CurrentRandomStream;
1888 }
1889
// Forwards to LocalGetCurrentRandomStream().
1890 RandomStream& GetCurrentRandomStream()
1891 {
1892 return LocalGetCurrentRandomStream();
1893 }
1894
1895
1896
1897
1898
1899
1900
// Assembles buf[0..n-1] into an unsigned long, with buf[0] as the least
// significant byte. Returns 0 when n <= 0.
1901 static inline
1902 unsigned long WordFromBytes(const unsigned char *buf, long n)
1903 {
1904 unsigned long res = 0;
1905 long i;
1906
// walk from the most significant byte down, shifting earlier bytes up
1907 for (i = n-1; i >= 0; i--)
1908 res = (res << 8) | buf[i];
1909
1910 return res;
17921911 }
17931912
17941913
17951914 unsigned long RandomWord()
17961915 {
1916 RandomStream& stream = LocalGetCurrentRandomStream();
17971917 unsigned char buf[NTL_BITS_PER_LONG/8];
1798 long i;
1799 unsigned long res;
1800
1801 ran_bytes(buf, NTL_BITS_PER_LONG/8);
1802
1803 res = 0;
1804 for (i = NTL_BITS_PER_LONG/8 - 1; i >= 0; i--) {
1805 res = res << 8;
1806 res = res | buf[i];
1807 }
1808
1809 return res;
1918
1919 stream.get(buf, NTL_BITS_PER_LONG/8);
1920 return WordFromBytes(buf, NTL_BITS_PER_LONG/8);
18101921 }
18111922
18121923 long RandomBits_long(long l)
18151926 if (l >= NTL_BITS_PER_LONG)
18161927 ResourceError("RandomBits: length too big");
18171928
1929 RandomStream& stream = LocalGetCurrentRandomStream();
18181930 unsigned char buf[NTL_BITS_PER_LONG/8];
1819 unsigned long res;
1820 long i;
1821
18221931 long nb = (l+7)/8;
1823 ran_bytes(buf, nb);
1824
1825 res = 0;
1826 for (i = nb - 1; i >= 0; i--) {
1827 res = res << 8;
1828 res = res | buf[i];
1829 }
1830
1831 return long(res & ((1UL << l)-1UL));
1932 stream.get(buf, nb);
1933
1934 return long(WordFromBytes(buf, nb) & ((1UL << l)-1UL));
18321935 }
18331936
18341937 unsigned long RandomBits_ulong(long l)
18371940 if (l > NTL_BITS_PER_LONG)
18381941 ResourceError("RandomBits: length too big");
18391942
1943 RandomStream& stream = LocalGetCurrentRandomStream();
18401944 unsigned char buf[NTL_BITS_PER_LONG/8];
1841 unsigned long res;
1842 long i;
1843
18441945 long nb = (l+7)/8;
1845 ran_bytes(buf, nb);
1846
1847 res = 0;
1848 for (i = nb - 1; i >= 0; i--) {
1849 res = res << 8;
1850 res = res | buf[i];
1851 }
1852
1946 stream.get(buf, nb);
1947 unsigned long res = WordFromBytes(buf, nb);
18531948 if (l < NTL_BITS_PER_LONG)
18541949 res = res & ((1UL << l)-1UL);
1855
18561950 return res;
18571951 }
18581952
18631957 if (l >= NTL_BITS_PER_LONG)
18641958 ResourceError("RandomLen: length too big");
18651959
1866 return RandomBits_long(l-1) + (1L << (l-1));
1867 }
1960 RandomStream& stream = LocalGetCurrentRandomStream();
1961 unsigned char buf[NTL_BITS_PER_LONG/8];
1962 long nb = ((l-1)+7)/8;
1963 stream.get(buf, nb);
1964 unsigned long res = WordFromBytes(buf, nb);
1965 unsigned long mask = (1UL << (l-1)) - 1UL;
1966 return long((res & mask) | (mask+1UL));
1967 }
1968
1969
1970 long RandomBnd(long bnd)
1971 {
1972 if (bnd <= 1) return 0;
1973
1974 RandomStream& stream = LocalGetCurrentRandomStream();
1975 unsigned char buf[NTL_BITS_PER_LONG/8];
1976 long l = NumBits(bnd-1);
1977 long nb = (l+7)/8;
1978
1979 long tmp;
1980 do {
1981 stream.get(buf, nb);
1982 tmp = long(WordFromBytes(buf, nb) & ((1UL << l)-1UL));
1983 } while (tmp >= bnd);
1984
1985 return tmp;
1986 }
1987
18681988
18691989
18701990 void RandomBits(ZZ& x, long l)
18771997 if (NTL_OVERFLOW(l, 1, 0))
18781998 ResourceError("RandomBits: length too big");
18791999
2000 RandomStream& stream = LocalGetCurrentRandomStream();
2001
18802002 long nb = (l+7)/8;
1881
1882 NTL_THREAD_LOCAL static Vec<unsigned char> buf_mem;
2003 unsigned long mask = (1UL << (8 - nb*8 + l)) - 1UL;
2004
2005 NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
18832006 Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
18842007
18852008 buf_mem.SetLength(nb);
18862009 unsigned char *buf = buf_mem.elts();
18872010
1888 ran_bytes(buf, nb);
1889
1890 NTL_ZZRegister(res);
1891
1892 ZZFromBytes(res, buf, nb);
1893 trunc(res, res, l);
1894
1895 x = res;
2011 x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
2012 // pre-allocate to ensure strong ES
2013
2014 stream.get(buf, nb);
2015 buf[nb-1] &= mask;
2016
2017 ZZFromBytes(x, buf, nb);
18962018 }
18972019
18982020
19112033 if (NTL_OVERFLOW(l, 1, 0))
19122034 ResourceError("RandomLen: length too big");
19132035
1914 // pre-allocate space to avoid two allocations
1915 long nw = (l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS;
1916 x.SetSize(nw);
1917
1918 RandomBits(x, l-1);
1919 SetBit(x, l-1);
1920 }
1921
1922
1923 const long RandomBndExcess = 8;
1924
2036 RandomStream& stream = LocalGetCurrentRandomStream();
2037
2038 long nb = (l+7)/8;
2039 unsigned long mask = (1UL << (8 - nb*8 + l)) - 1UL;
2040
2041 NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
2042 Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
2043
2044 buf_mem.SetLength(nb);
2045 unsigned char *buf = buf_mem.elts();
2046
2047 x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
2048 // pre-allocate to ensure strong ES
2049
2050 stream.get(buf, nb);
2051 buf[nb-1] &= mask;
2052 buf[nb-1] |= ((mask >> 1) + 1UL);
2053
2054 ZZFromBytes(x, buf, nb);
2055 }
2056
2057
2058
2059
2060
2061 /**********************************************************
2062
2063 The following implementation of RandomBnd is designed
2064 for speed. It certainly is not resilient against a
2065 timing side-channel attack (but then again, none of these
2066 PRG routines are designed to be).
2067
2068 The naive strategy generates random candidates of the right
2069 bit length until the candidate < bnd.
2070 The idea in this implementation is to generate the high
2071 order two bytes of the candidate first, and compare this
2072 to the high order two bytes of bnd. We can discard the
2073 candidate if this is already too large.
2074
2075 ***********************************************************/
19252076
19262077 void RandomBnd(ZZ& x, const ZZ& bnd)
19272078 {
19302081 return;
19312082 }
19322083
1933 long k = NumBits(bnd);
1934
1935 if (weight(bnd) == 1) {
1936 RandomBits(x, k-1);
1937 return;
1938 }
1939
1940 long l = k + RandomBndExcess;
1941
1942 NTL_ZZRegister(t);
1943 NTL_ZZRegister(r);
1944 NTL_ZZRegister(t1);
1945
1946 do {
1947 RandomBits(t, l);
1948 rem(r, t, bnd);
1949 sub(t1, bnd, r);
1950 add(t, t, t1);
1951 } while (NumBits(t) > l);
1952
1953 x = r;
1954 }
1955
1956 long RandomBnd(long bnd)
1957 {
1958 if (bnd <= 1) return 0;
1959
1960 long k = NumBits(bnd);
1961
1962 if (((bnd - 1) & bnd) == 0)
1963 return RandomBits_long(k-1);
1964
1965 long l = k + RandomBndExcess;
1966
1967 if (l > NTL_BITS_PER_LONG-2) {
1968 NTL_ZZRegister(Bnd);
1969 NTL_ZZRegister(res);
1970
1971 Bnd = bnd;
1972 RandomBnd(res, Bnd);
1973 return to_long(res);
1974 }
1975
1976 long t, r;
1977
1978 do {
1979 t = RandomBits_long(l);
1980 r = t % bnd;
1981 } while (t + bnd - r > (1L << l));
1982
1983 return r;
2084 RandomStream& stream = LocalGetCurrentRandomStream();
2085
2086 long l = NumBits(bnd);
2087 long nb = (l+7)/8;
2088
2089 if (nb <= 3) {
2090 long lbnd = conv<long>(bnd);
2091 unsigned char lbuf[3];
2092 long ltmp;
2093
2094 x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
2095 // pre-allocate to ensure strong ES
2096 do {
2097 stream.get(lbuf, nb);
2098 ltmp = long(WordFromBytes(lbuf, nb) & ((1UL << l)-1UL));
2099 } while (ltmp >= lbnd);
2100
2101 conv(x, ltmp);
2102 return;
2103 }
2104
2105 // deal with possible alias
2106 NTL_ZZRegister(tmp_store);
2107 const ZZ& bnd_ref = ((&x == &bnd) ? (tmp_store = bnd) : bnd);
2108
2109
2110 NTL_ZZRegister(hbnd);
2111 RightShift(hbnd, bnd_ref, (nb-2)*8);
2112 long lhbnd = conv<long>(hbnd);
2113
2114 unsigned long mask = (1UL << (16 - nb*8 + l)) - 1UL;
2115
2116 NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
2117 Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
2118 buf_mem.SetLength(nb);
2119 unsigned char *buf = buf_mem.elts();
2120
2121 unsigned char hbuf[2];
2122
2123 x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
2124 // pre-allocate to ensure strong ES
2125 for (;;) {
2126 stream.get(hbuf, 2);
2127 long hpart = long(WordFromBytes(hbuf, 2) & mask);
2128
2129 if (hpart > lhbnd) continue;
2130
2131 stream.get(buf, nb-2);
2132 buf[nb-2] = ((unsigned long) hpart);
2133 buf[nb-1] = ((unsigned long) hpart) >> 8;
2134
2135 ZZFromBytes(x, buf, nb);
2136 if (hpart < lhbnd || x < bnd_ref) break;
2137 }
19842138 }
19852139
19862140
19912145 static
19922146 double Log2(double x)
19932147 {
1994 NTL_THREAD_LOCAL static double log2 = log(2.0);
2148 static const double log2 = log(2.0); // GLOBAL (relies on C++11 thread-safe init)
19952149 return log(x)/log2;
19962150 }
19972151
88
99 const ZZX& ZZX::zero()
1010 {
11 NTL_THREAD_LOCAL static ZZX z;
11 static const ZZX z; // GLOBAL (relies on C++11 thread-safe init)
1212 return z;
1313 }
1414
820820 }
821821
822822
823 if (maxa + maxb >= 30 &&
824 SSRatio(deg(a), MaxBits(a), deg(b), MaxBits(b)) < 1.25) {
823 double rat = SSRatio(deg(a), MaxBits(a), deg(b), MaxBits(b));
824 long k1 = (maxa + maxb)/2;
825
826 if (
827
828 (k1 >= 26 && rat < 1.40) ||
829 (k1 >= 53 && rat < 1.60) ||
830 (k1 >= 106 && rat < 1.80) ||
831 (k1 >= 212 && rat < 2.00)
832
833 ) {
825834 SSMul(c, a, b);
826835 }
827836 else {
10091018 }
10101019
10111020 long mba = MaxBits(a);
1012
1013 if (2*maxa >= 30 &&
1014 SSRatio(deg(a), mba, deg(a), mba) < 1.25)
1021 double rat = SSRatio(deg(a), mba, deg(a), mba);
1022 long k1 = maxa;
1023
1024 if (
1025
1026 (k1 >= 26 && rat < 1.40) ||
1027 (k1 >= 53 && rat < 1.60) ||
1028 (k1 >= 106 && rat < 1.80) ||
1029 (k1 >= 212 && rat < 2.00)
1030
1031 ) {
10151032 SSSqr(c, a);
1016 else
1033 }
1034 else {
10171035 HomSqr(c, a);
1036 }
10181037 }
10191038
10201039
1010
1111 NTL_START_IMPL
1212
13 NTL_THREAD_LOCAL long ZZXFac_van_Hoeij = 1;
14
15 NTL_THREAD_LOCAL static long ok_to_abandon = 0;
13 NTL_CHEAP_THREAD_LOCAL long ZZXFac_van_Hoeij = 1;
14
15 static NTL_CHEAP_THREAD_LOCAL long ok_to_abandon = 0;
1616
1717 struct LocalInfoT {
1818 long n;
463463 f.normalize();
464464 }
465465
466 NTL_THREAD_LOCAL long ZZXFac_InitNumPrimes = 7;
467 NTL_THREAD_LOCAL long ZZXFac_MaxNumPrimes = 50;
466 NTL_CHEAP_THREAD_LOCAL long ZZXFac_InitNumPrimes = 7;
467 NTL_CHEAP_THREAD_LOCAL long ZZXFac_MaxNumPrimes = 50;
468468
469469 static
470470 void RecordPattern(vec_long& pat, vec_pair_zz_pX_long& fac)
846846 const vec_ZZ_pX& W, const vec_ZZX& factors,
847847 const ZZX& f, long k, long verbose)
848848 {
849 NTL_THREAD_LOCAL static long cnt = 0;
849 static NTL_CHEAP_THREAD_LOCAL long cnt = 0;
850850
851851 if (verbose) {
852852 cnt = (cnt + 1) % 100;
16101610 }
16111611
16121612
1613 NTL_THREAD_LOCAL long ZZXFac_MaxPrune = 10;
1613 NTL_CHEAP_THREAD_LOCAL long ZZXFac_MaxPrune = 10;
16141614
16151615
16161616
36523652 }
36533653 }
36543654
3655 NTL_THREAD_LOCAL long ZZXFac_PowerHack = 1;
3655 NTL_CHEAP_THREAD_LOCAL long ZZXFac_PowerHack = 1;
36563656
36573657 void SFFactor(vec_ZZX& factors, const ZZX& ff,
36583658 long verbose,
99
1010
1111
12 NTL_THREAD_LOCAL SmartPtr<ZZ_pInfoT> ZZ_pInfo = 0;
13 NTL_THREAD_LOCAL SmartPtr<ZZ_pTmpSpaceT> ZZ_pTmpSpace = 0;
14 NTL_THREAD_LOCAL bool ZZ_pInstalled = false;
12 NTL_TLS_GLOBAL_DECL(SmartPtr<ZZ_pInfoT>, ZZ_pInfo_stg)
13 NTL_TLS_GLOBAL_DECL(SmartPtr<ZZ_pTmpSpaceT>, ZZ_pTmpSpace_stg)
14
15 NTL_CHEAP_THREAD_LOCAL ZZ_pInfoT *ZZ_pInfo = 0;
16 NTL_CHEAP_THREAD_LOCAL ZZ_pTmpSpaceT *ZZ_pTmpSpace = 0;
17 NTL_CHEAP_THREAD_LOCAL bool ZZ_pInstalled = false;
1518
1619
1720
9194
9295 double fn = double(n);
9396
97 // NOTE: these next two range checks are somewhat academic,
98 // but various bits of code in the ZZ_pX implementation
99 // implicitly rely on them
100
94101 if (8.0*fn*(fn+48) > NTL_FDOUBLE_PRECISION)
95102 ResourceError("modulus too big");
96103
97
98 if (8.0*fn*(fn+48) <= NTL_FDOUBLE_PRECISION/double(NTL_SP_BOUND))
99 FFTInfo->QuickCRT = true;
100 else
101 FFTInfo->QuickCRT = false;
102
103 // FIXME: some of this stuff does not need to be initialized
104 // at all if FFTInfo->crt_struct.special()
105
106 FFTInfo->x.SetLength(n);
107 FFTInfo->u.SetLength(n);
108 FFTInfo->uqinv.SetLength(n);
104 if (n >= NTL_SP_BOUND)
105 ResourceError("modulus too big");
106
107
109108
110109 FFTInfo->rem_struct.init(n, ZZ_pInfo->p, GetFFTPrime);
111
112110 FFTInfo->crt_struct.init(n, ZZ_pInfo->p, GetFFTPrime);
113111
114112 if (!FFTInfo->crt_struct.special()) {
113 FFTInfo->prime.SetLength(n);
114 FFTInfo->prime_recip.SetLength(n);
115 FFTInfo->u.SetLength(n);
116 FFTInfo->uqinv.SetLength(n);
117
118 // montgomery
119 FFTInfo->reduce_struct.init(ZZ_pInfo->p, ZZ(n) << NTL_SP_NBITS);
120
115121 ZZ qq, rr;
116122
117123 DivRem(qq, rr, M, ZZ_pInfo->p);
118124
119125 NegateMod(FFTInfo->MinusMModP, rr, ZZ_pInfo->p);
126
127 // montgomery
128 FFTInfo->reduce_struct.adjust(FFTInfo->MinusMModP);
120129
121130 for (i = 0; i < n; i++) {
122131 q = GetFFTPrime(i);
133142 t = rem(M1, q);
134143 t = InvMod(t, q);
135144
136 mul(M3, M2, t);
137 rem(M3, M3, ZZ_pInfo->p);
138
139 FFTInfo->crt_struct.insert(i, M3);
140
141
142 FFTInfo->x[i] = ((double) t)/((double) q);
145 // montgomery
146 FFTInfo->reduce_struct.adjust(M2);
147
148 FFTInfo->crt_struct.insert(i, M2);
149
150 FFTInfo->prime[i] = q;
151 FFTInfo->prime_recip[i] = 1/double(q);
143152 FFTInfo->u[i] = t;
144153 FFTInfo->uqinv[i] = PrepMulModPrecon(FFTInfo->u[i], q, qinv);
145154 }
155
146156 }
147157
148158 tmps = MakeSmart<ZZ_pTmpSpaceT>();
159169 tmps->rem_tmp_vec.fetch(FFTInfo->rem_struct);
160170 }
161171
162 ZZ_pTmpSpace = tmps;
172 NTL_TLS_GLOBAL_ACCESS(ZZ_pTmpSpace_stg);
173 ZZ_pTmpSpace_stg = tmps;
174 ZZ_pTmpSpace = ZZ_pTmpSpace_stg.get();
163175 }
164176
165177
172184 }
173185
174186
187 void ZZ_pContext::save()
188 {
189 NTL_TLS_GLOBAL_ACCESS(ZZ_pInfo_stg);
190 ptr = ZZ_pInfo_stg;
191 }
192
175193
176194 void ZZ_pContext::restore() const
177195 {
178 ZZ_pInfo = ptr;
196 if (ZZ_pInfo == ptr.get()) return;
197 // NOTE: this simple optimization could be useful in some situations,
198 // for example, a worker thread re-setting the current modulus
199 // in a multi-threaded build
200
201 NTL_TLS_GLOBAL_ACCESS(ZZ_pInfo_stg);
202 ZZ_pInfo_stg = ptr;
203 ZZ_pInfo = ZZ_pInfo_stg.get();
204
205 NTL_TLS_GLOBAL_ACCESS(ZZ_pTmpSpace_stg);
206 ZZ_pTmpSpace_stg = 0;
179207 ZZ_pTmpSpace = 0;
208
180209 ZZ_pInstalled = false;
181210 }
182211
203232
204233 const ZZ_p& ZZ_p::zero()
205234 {
206 NTL_THREAD_LOCAL static ZZ_p z(INIT_NO_ALLOC);
235 static const ZZ_p z(INIT_NO_ALLOC); // GLOBAL (assumes C++11 thread-safe init)
207236 return z;
208237 }
209238
210 NTL_THREAD_LOCAL
239 NTL_CHEAP_THREAD_LOCAL
211240 ZZ_p::DivHandlerPtr ZZ_p::DivHandler = 0;
212241
213242
44 #include <NTL/new.h>
55
66 NTL_START_IMPL
7
8
9 NTL_TLS_GLOBAL_DECL(SmartPtr<ZZ_pEInfoT>, ZZ_pEInfo_stg)
10
11 NTL_CHEAP_THREAD_LOCAL
12 ZZ_pEInfoT *ZZ_pEInfo = 0;
13
714
815 ZZ_pEInfoT::ZZ_pEInfoT(const ZZ_pX& NewP)
916 {
3239
3340
3441
35
36 NTL_THREAD_LOCAL
37 SmartPtr<ZZ_pEInfoT> ZZ_pEInfo = 0;
38
39
4042 void ZZ_pE::init(const ZZ_pX& p)
4143 {
4244 ZZ_pEContext c(p);
4648
4749 void ZZ_pEContext::save()
4850 {
49 ptr = ZZ_pEInfo;
51 NTL_TLS_GLOBAL_ACCESS(ZZ_pEInfo_stg);
52 ptr = ZZ_pEInfo_stg;
5053 }
5154
5255 void ZZ_pEContext::restore() const
5356 {
54 ZZ_pEInfo = ptr;
57 NTL_TLS_GLOBAL_ACCESS(ZZ_pEInfo_stg);
58 ZZ_pEInfo_stg = ptr;
59 ZZ_pEInfo = ZZ_pEInfo_stg.get();
5560 }
5661
5762
7681
7782 const ZZ_pE& ZZ_pE::zero()
7883 {
79 static ZZ_pE z(INIT_NO_ALLOC);
84 static const ZZ_pE z(INIT_NO_ALLOC); // GLOBAL (assumes C++11 thread-safe init)
8085 return z;
8186 }
8287
1111
1212 const ZZ_pEX& ZZ_pEX::zero()
1313 {
14 NTL_THREAD_LOCAL static ZZ_pEX z;
14 static const ZZ_pEX z; // GLOBAL (assumes C++11 thread-safe init)
1515 return z;
1616 }
1717
22082208 MulMod(A.H[i], A.H[i-1], h, F);
22092209 }
22102210
2211 NTL_THREAD_LOCAL long ZZ_pEXArgBound = 0;
2211 NTL_CHEAP_THREAD_LOCAL long ZZ_pEXArgBound = 0;
22122212
22132213
22142214
357357 }
358358
359359
360 NTL_THREAD_LOCAL long ZZ_pEX_BlockingFactor = 10;
360 NTL_CHEAP_THREAD_LOCAL long ZZ_pEX_BlockingFactor = 10;
361361
362362
363363
10611061
10621062 /************* NEW DDF ****************/
10631063
1064 NTL_THREAD_LOCAL long ZZ_pEX_GCDTableSize = 4;
1065 NTL_THREAD_LOCAL double ZZ_pEXFileThresh = NTL_FILE_THRESH;
1066 NTL_THREAD_LOCAL static vec_ZZ_pEX *BabyStepFile=0;
1067 NTL_THREAD_LOCAL static vec_ZZ_pEX *GiantStepFile=0;
1068 NTL_THREAD_LOCAL static long use_files;
1064 NTL_CHEAP_THREAD_LOCAL long ZZ_pEX_GCDTableSize = 4;
1065 NTL_CHEAP_THREAD_LOCAL double ZZ_pEXFileThresh = NTL_FILE_THRESH;
1066 static NTL_CHEAP_THREAD_LOCAL vec_ZZ_pEX *BabyStepFile=0;
1067 static NTL_CHEAP_THREAD_LOCAL vec_ZZ_pEX *GiantStepFile=0;
1068 static NTL_CHEAP_THREAD_LOCAL long use_files;
10691069
10701070
10711071 static
00 #include <NTL/ZZ_pX.h>
1 #include <NTL/BasicThreadPool.h>
2 #include <NTL/new.h>
13
24
35 // The mul & sqr routines use routines from ZZX,
1113
1214 #endif
1315
14 #include <NTL/new.h>
1516
1617
1718 #if (defined(NTL_GMP_LIP))
2930
3031 const ZZ_pX& ZZ_pX::zero()
3132 {
32 NTL_THREAD_LOCAL static ZZ_pX z;
33 static const ZZ_pX z; // GLOBAL (relies on C++11 thread-safe init)
3334 return z;
3435 }
3536
434435
435436 #ifndef NTL_WIZARD_HACK
436437
437 // These crossovers are tuned for a Pentium, but hopefully
438 // they should be OK on other machines as well.
439
440
441 const long SS_kbound = 40;
442 const double SS_rbound = 1.25;
443
444438
445439 void mul(ZZ_pX& c, const ZZ_pX& a, const ZZ_pX& b)
446440 {
472466 else {
473467 long mbits;
474468 mbits = NumBits(ZZ_p::modulus());
475 if (k >= SS_kbound &&
476 SSRatio(deg(a), mbits, deg(b), mbits) < SS_rbound) {
469
470 long nt = 1;
471 // FIXME: needs to be updated when I thread-enable the SS
472 // mul routine
473
474 #ifdef NTL_THREAD_BOOST
475 BasicThreadPool *pool = GetThreadPool();
476 if (pool && !pool->active()) nt = pool->NumThreads();
477 #endif
478
479 double rat = SSRatio(deg(a), mbits, deg(b), mbits);
480
481 if ( nt == 1 && (
482
483 (k >= 53 && rat < 1.10) ||
484 (k >= 106 && rat < 1.30) ||
485 (k >= 212 && rat < 1.75)
486
487 )) {
477488 ZZX A, B, C;
478489 conv(A, a);
479490 conv(B, b);
512523 else {
513524 long mbits;
514525 mbits = NumBits(ZZ_p::modulus());
515 if (k >= SS_kbound &&
516 SSRatio(deg(a), mbits, deg(a), mbits) < SS_rbound) {
526
527
528 long nt = 1;
529 // FIXME: needs to be updated when I thread-enable the SS
530 // mul routine
531
532 #ifdef NTL_THREAD_BOOST
533 BasicThreadPool *pool = GetThreadPool();
534 if (pool && !pool->active()) nt = pool->NumThreads();
535 #endif
536
537 double rat = SSRatio(deg(a), mbits, deg(a), mbits);
538
539 if ( nt == 1 && (
540
541 (k >= 53 && rat < 1.10) ||
542 (k >= 106 && rat < 1.30) ||
543 (k >= 212 && rat < 1.75)
544
545 )) {
517546 ZZX A, C;
518547 conv(A, a);
519548 SSSqr(C, A);
9831012 r.normalize();
9841013 }
9851014
1015
1016
1017 NTL_TBDECL_static(MulAux)(ZZ_p* xp, const ZZ_p* ap, const ZZ_p& t, long n)
1018 {
1019 for (long i = 0; i < n; i++)
1020 mul(xp[i], ap[i], t);
1021 }
1022
1023 #ifdef NTL_THREAD_BOOST
1024 static void MulAux(ZZ_p* xp, const ZZ_p* ap, const ZZ_p& t, long n)
1025 {
1026 BasicThreadPool *pool = GetThreadPool();
1027
1028 if (!pool || pool->active() || pool->NumThreads() == 1) {
1029 basic_MulAux(xp, ap, t, n);
1030 return;
1031 }
1032
1033 ZZ_pContext local_context;
1034 local_context.save();
1035
1036 pool->exec_range(n,
1037 [xp, ap, &t, &local_context](long first, long last) {
1038 local_context.restore();
1039 for (long i = first; i < last; i++)
1040 mul(xp[i], ap[i], t);
1041 } );
1042 }
1043 #endif
1044
1045
1046
9861047 void mul(ZZ_pX& x, const ZZ_pX& a, const ZZ_p& b)
9871048 {
9881049 if (IsZero(b)) {
9971058
9981059 NTL_ZZ_pRegister(t);
9991060
1000 long i, da;
1061 long da;
10011062
10021063 const ZZ_p *ap;
10031064 ZZ_p* xp;
1004
10051065
10061066 t = b;
10071067
10101070 ap = a.rep.elts();
10111071 xp = x.rep.elts();
10121072
1013 for (i = 0; i <= da; i++)
1014 mul(xp[i], ap[i], t);
1073 MulAux(xp, ap, t, da+1);
10151074
10161075 x.normalize();
10171076 }
11671226 }
11681227
11691228
1229 NTL_TBDECL_static(MulByXModAux1)(long n, ZZ_p *hh, const ZZ_p* aa, const ZZ_p *ff, const ZZ_p& z)
1230 {
1231 NTL_ZZ_pRegister(t);
1232
1233 for (long i = n-1; i >= 1; i--) {
1234 // hh[i] = aa[i-1] + z*ff[i]
1235 mul(t, z, ff[i]);
1236 add(hh[i], aa[i-1], t);
1237 }
1238 }
1239
1240 #ifdef NTL_THREAD_BOOST
1241
1242 static void MulByXModAux1(long n, ZZ_p *hh, const ZZ_p* aa, const ZZ_p *ff, const ZZ_p& z)
1243 {
1244
1245 BasicThreadPool *pool = GetThreadPool();
1246
1247 if (!pool || pool->active() || pool->NumThreads() == 1 || hh == aa) {
1248 // Careful! can't parallelize if hh == aa
1249 basic_MulByXModAux1(n, hh, aa, ff, z);
1250 return;
1251 }
1252
1253 ZZ_pContext local_context;
1254 local_context.save();
1255
1256 pool->exec_range(n-1,
1257 [n, hh, aa, ff, &z, &local_context]
1258 (long first, long last) {
1259 local_context.restore();
1260 NTL_ZZ_pRegister(t);
1261
1262 for (long idx = first; idx < last; idx++) {
1263 long i = n-1-idx;
1264 // hh[i] = aa[i-1] + z*ff[i]
1265 mul(t, z, ff[i]);
1266 add(hh[i], aa[i-1], t);
1267 }
1268 } );
1269 }
1270
1271
1272 #endif
11701273
11711274
11721275 static
11761279 ZZ_p* hh;
11771280 const ZZ_p *aa, *ff;
11781281
1179 ZZ_p t, z;
1282 NTL_ZZ_pRegister(z);
11801283
11811284 n = deg(f);
11821285 m = deg(a);
12041307 negate(z, aa[n-1]);
12051308 if (!IsOne(ff[n]))
12061309 div(z, z, ff[n]);
1207 for (i = n-1; i >= 1; i--) {
1208 mul(t, z, ff[i]);
1209 add(hh[i], aa[i-1], t);
1210 }
1310
1311 MulByXModAux1(n, hh, aa, ff, z);
1312
12111313 mul(hh[0], z, ff[0]);
12121314 h.normalize();
12131315 }
13231425
13241426
13251427
1326 NTL_THREAD_LOCAL static vec_long ModularRepBuf;
13271428 // FIXME: maybe I could put this in scratch space associated
13281429 // with the current modulus
1430 static inline
1431 vec_long& ModularRepBuf()
1432 {
1433 NTL_TLS_LOCAL(vec_long, t);
1434 return t;
1435 }
13291436
13301437
13311438 void ToModularRep(vec_long& x, const ZZ_p& a, const ZZ_pFFTInfoT *FFTInfo,
13351442 }
13361443
13371444
1338 // NOTE: earlier versions used Kahan summation...
1339 // we no longer do this, as it is less portable than I thought.
1340
1341 void FromModularRep(ZZ_p& x, const vec_long& a, const ZZ_pFFTInfoT *FFTInfo,
1445 void FromModularRep(ZZ_p& x, vec_long& avec, const ZZ_pFFTInfoT *FFTInfo,
13421446 ZZ_pTmpSpaceT *TmpSpace)
1343 {
1344 long n = FFTInfo->NumPrimes;
1345 NTL_ZZRegister(q);
1346 NTL_ZZRegister(s);
1447 // NOTE: a gets destroyed
1448
1449 {
13471450 NTL_ZZRegister(t);
1348 long i;
1349 double y;
1451 long * NTL_RESTRICT a = avec.elts();
13501452
13511453 if (FFTInfo->crt_struct.special()) {
1352 FFTInfo->crt_struct.eval(t, &a[0], TmpSpace->crt_tmp_vec);
1454 FFTInfo->crt_struct.eval(t, a, TmpSpace->crt_tmp_vec);
13531455 x.LoopHole() = t;
13541456 return;
13551457 }
1458
1459 long nprimes = FFTInfo->NumPrimes;
1460 const long *u = FFTInfo->u.elts();
1461 const long *prime = FFTInfo->prime.elts();
1462 const mulmod_precon_t *uqinv = FFTInfo->uqinv.elts();
1463 const double *prime_recip = FFTInfo->prime_recip.elts();
13561464
1357
1358 if (FFTInfo->QuickCRT) {
1359 y = double(0L);
1360 for (i = 0; i < n; i++)
1361 y += ((double) a[i])*FFTInfo->x[i];
1362
1363 conv(q, (y + 0.5));
1364 }
1465 double y = 0.0;
1466
1467 for (long i = 0; i < nprimes; i++) {
1468 long r = MulModPrecon(a[i], u[i], prime[i], uqinv[i]);
1469 a[i] = r;
1470 y += double(r)*prime_recip[i];
1471 }
1472
1473 long q = long(y + 0.5);
1474
1475 FFTInfo->crt_struct.eval(t, a, TmpSpace->crt_tmp_vec);
1476
1477 MulAddTo(t, FFTInfo->MinusMModP, q);
1478 // TODO: this MulAddTo could be folded into the above
1479 // crt_struct.eval as just another product to accumulate...
1480 // but, savings would be marginal and a number of interfaces
1481 // would have to be modified...
1482
1483 // montgomery
1484 FFTInfo->reduce_struct.eval(x.LoopHole(), t);
1485 }
1486
1487
1488
1489
1490
1491 NTL_TBDECL(ToFFTRep)(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi)
1492 // computes an n = 2^k point convolution.
1493 // if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
1494 {
1495 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1496 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1497
1498
1499 long n, i, j, m, j1;
1500 vec_long& t = ModularRepBuf();
1501
1502
1503 if (k > FFTInfo->MaxRoot)
1504 ResourceError("Polynomial too big for FFT");
1505
1506 if (lo < 0)
1507 LogicError("bad arg to ToFFTRep");
1508
1509 long nprimes = FFTInfo->NumPrimes;
1510 t.SetLength(nprimes);
1511
1512 hi = min(hi, deg(x));
1513
1514 y.SetSize(k);
1515
1516 n = 1L << k;
1517
1518 m = max(hi-lo + 1, 0);
1519
1520 const ZZ_p *xx = x.rep.elts();
1521
1522 if (n >= m) {
1523 for (j = 0; j < m; j++) {
1524 ToModularRep(t, xx[j+lo], FFTInfo, TmpSpace);
1525 for (i = 0; i < nprimes; i++) {
1526 y.tbl[i][j] = t[i];
1527 }
1528 }
1529
1530 if (n > m) {
1531 for (i = 0; i < nprimes; i++) {
1532 long *yp = &y.tbl[i][0];
1533 for (j = m; j < n; j++) {
1534 yp[j] = 0;
1535 }
1536 }
1537 }
1538 }
13651539 else {
1366 long Q, r;
1367 long qq;
1368
1369 y = double(0L);
1370
1371 clear(q);
1372
1373 for (i = 0; i < n; i++) {
1374 r = MulModPreconWithQuo(Q, a[i], FFTInfo->u[i], GetFFTPrime(i), FFTInfo->uqinv[i]);
1375 // FIXME: add to documented interface?
1376
1377 add(q, q, Q);
1378 y += double(r)*GetFFTPrimeRecip(i);
1379 }
1380
1381 qq = long(y + 0.5);
1382 add(q, q, qq);
1383 }
1384
1385 FFTInfo->crt_struct.eval(t, &a[0], TmpSpace->crt_tmp_vec);
1386
1387 mul(s, q, FFTInfo->MinusMModP);
1388 add(t, t, s);
1389
1390 conv(x, t);
1391 }
1392
1393
1394
1540 NTL_ZZ_pRegister(accum);
1541 for (j = 0; j < n; j++) {
1542 accum = xx[j+lo];
1543 for (j1 = j + n; j1 < m; j1 += n)
1544 add(accum, accum, xx[j1+lo]);
1545 ToModularRep(t, accum, FFTInfo, TmpSpace);
1546 for (i = 0; i < nprimes; i++) {
1547 y.tbl[i][j] = t[i];
1548 }
1549 }
1550 }
1551
1552 // FIXME: something to think about...part of the above logic
1553 // is essentially a matrix transpose, which could lead to bad
1554 // cache performance. I don't really know if that is an issue.
1555
1556 for (i = 0; i < nprimes; i++) {
1557 long *yp = &y.tbl[i][0];
1558 FFTFwd(yp, yp, k, i);
1559 }
1560 }
1561
1562
1563 #ifdef NTL_THREAD_BOOST
13951564
13961565 void ToFFTRep(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi)
13971566 // computes an n = 2^k point convolution.
13981567 // if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
13991568 {
1569 BasicThreadPool *pool = GetThreadPool();
1570
1571 if (!pool || pool->active() || pool->NumThreads() == 1) {
1572 basic_ToFFTRep(y, x, k, lo, hi);
1573 return;
1574 }
1575
1576
1577
1578 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1579
1580 long n, m;
1581
1582
1583 if (k > FFTInfo->MaxRoot)
1584 ResourceError("Polynomial too big for FFT");
1585
1586 if (lo < 0)
1587 LogicError("bad arg to ToFFTRep");
1588
1589 long nprimes = FFTInfo->NumPrimes;
1590
1591 hi = min(hi, deg(x));
1592
1593 y.SetSize(k);
1594
1595 n = 1L << k;
1596
1597 m = max(hi-lo + 1, 0);
1598
1599 const ZZ_p *xx = x.rep.elts();
1600
1601
1602 ZZ_pContext local_context;
1603 local_context.save();
1604
1605 if (n >= m) {
1606 pool->exec_range(m,
1607 [lo, xx, &y, nprimes, &local_context, FFTInfo]
1608 (long first, long last) {
1609
1610 local_context.restore();
1611 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1612 // TmpSpace is thread local!
1613
1614 vec_long& t = ModularRepBuf();
1615 t.SetLength(nprimes);
1616
1617 for (long j = first; j < last; j++) {
1618 ToModularRep(t, xx[j+lo], FFTInfo, TmpSpace);
1619 for (long i = 0; i < nprimes; i++) {
1620 y.tbl[i][j] = t[i];
1621 }
1622 }
1623 } );
1624 }
1625 else {
1626 pool->exec_range(n,
1627 [lo, m, n, xx, &y, nprimes, &local_context, FFTInfo]
1628 (long first, long last) {
1629 local_context.restore();
1630 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1631 // TmpSpace is thread local!
1632
1633 vec_long& t = ModularRepBuf();
1634 t.SetLength(nprimes);
1635
1636 NTL_ZZ_pRegister(accum);
1637 for (long j = first; j < last; j++) {
1638 accum = xx[j+lo];
1639 for (long j1 = j + n; j1 < m; j1 += n)
1640 add(accum, accum, xx[j1+lo]);
1641 ToModularRep(t, accum, FFTInfo, TmpSpace);
1642 for (long i = 0; i < nprimes; i++) {
1643 y.tbl[i][j] = t[i];
1644 }
1645 }
1646 } );
1647 }
1648
1649 // FIXME: something to think about...part of the above logic
1650 // is essentially a matrix transpose, which could lead to bad
1651 // cache performance. I don't really know if that is an issue.
1652
1653 pool->exec_range(nprimes,
1654 [&y, m, n, k](long first, long last) {
1655 for (long i = first; i < last; i++) {
1656 long *yp = &y.tbl[i][0];
1657 for (long j = m; j < n; j++) yp[j] = 0;
1658 FFTFwd(yp, yp, k, i);
1659 }
1660 } );
1661 }
1662
1663 #endif
1664
1665
1666
1667 NTL_TBDECL(RevToFFTRep)(FFTRep& y, const vec_ZZ_p& x,
1668 long k, long lo, long hi, long offset)
1669 // computes an n = 2^k point convolution of X^offset*x[lo..hi] mod X^n-1
1670 // using "inverted" evaluation points.
1671
1672 {
14001673 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
14011674 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1402
1675
14031676
14041677 long n, i, j, m, j1;
1405 vec_long& t = ModularRepBuf;
1406 ZZ_p accum;
1407
1678 vec_long& t = ModularRepBuf();
1679 NTL_ZZ_pRegister(accum);
14081680
14091681 if (k > FFTInfo->MaxRoot)
14101682 ResourceError("Polynomial too big for FFT");
14121684 if (lo < 0)
14131685 LogicError("bad arg to ToFFTRep");
14141686
1415 t.SetLength(FFTInfo->NumPrimes);
1416
1417 hi = min(hi, deg(x));
1687 long nprimes = FFTInfo->NumPrimes;
1688 t.SetLength(nprimes);
1689
1690 hi = min(hi, x.length()-1);
14181691
14191692 y.SetSize(k);
14201693
14221695
14231696 m = max(hi-lo + 1, 0);
14241697
1425 const ZZ_p *xx = x.rep.elts();
1698 const ZZ_p *xx = x.elts();
1699
1700 offset = offset & (n-1);
14261701
14271702 for (j = 0; j < n; j++) {
14281703 if (j >= m) {
1429 for (i = 0; i < FFTInfo->NumPrimes; i++)
1430 y.tbl[i][j] = 0;
1704 for (i = 0; i < nprimes; i++)
1705 y.tbl[i][offset] = 0;
14311706 }
14321707 else {
14331708 accum = xx[j+lo];
14341709 for (j1 = j + n; j1 < m; j1 += n)
14351710 add(accum, accum, xx[j1+lo]);
14361711 ToModularRep(t, accum, FFTInfo, TmpSpace);
1437 for (i = 0; i < FFTInfo->NumPrimes; i++) {
1438 y.tbl[i][j] = t[i];
1712 for (i = 0; i < nprimes; i++) {
1713 y.tbl[i][offset] = t[i];
1714
14391715 }
14401716 }
1441 }
1442
1443 // FIXME: something to think about...part of the above logic
1444 // is essentially a matrix transpose, which could lead to bad
1445 // cache performance. I don't really know if that is an issue.
1446
1447 for (i = 0; i < FFTInfo->NumPrimes; i++) {
1717
1718 offset = (offset + 1) & (n-1);
1719 }
1720
1721
1722 for (i = 0; i < nprimes; i++) {
14481723 long *yp = &y.tbl[i][0];
1449 FFTFwd(yp, yp, k, i);
1450 }
1451 }
1452
1453
1724 FFTRev1(yp, yp, k, i);
1725 }
1726
1727 }
1728
1729
1730
1731 #ifdef NTL_THREAD_BOOST
14541732
14551733 void RevToFFTRep(FFTRep& y, const vec_ZZ_p& x,
14561734 long k, long lo, long hi, long offset)
14581736 // using "inverted" evaluation points.
14591737
14601738 {
1739 BasicThreadPool *pool = GetThreadPool();
1740
1741 if (!pool || pool->active() || pool->NumThreads() == 1) {
1742 basic_RevToFFTRep(y, x, k, lo, hi, offset);
1743 return;
1744 }
1745
14611746 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1462 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1463
1464
1465 long n, i, j, m, j1;
1466 vec_long& t = ModularRepBuf;
1467 ZZ_p accum;
1747
1748 long n, m;
14681749
14691750 if (k > FFTInfo->MaxRoot)
14701751 ResourceError("Polynomial too big for FFT");
14721753 if (lo < 0)
14731754 LogicError("bad arg to ToFFTRep");
14741755
1475 t.SetLength(FFTInfo->NumPrimes);
1756 long nprimes = FFTInfo->NumPrimes;
14761757
14771758 hi = min(hi, x.length()-1);
14781759
14861767
14871768 offset = offset & (n-1);
14881769
1489 for (j = 0; j < n; j++) {
1490 if (j >= m) {
1491 for (i = 0; i < FFTInfo->NumPrimes; i++)
1492 y.tbl[i][offset] = 0;
1493 }
1494 else {
1495 accum = xx[j+lo];
1496 for (j1 = j + n; j1 < m; j1 += n)
1497 add(accum, accum, xx[j1+lo]);
1498 ToModularRep(t, accum, FFTInfo, TmpSpace);
1499 for (i = 0; i < FFTInfo->NumPrimes; i++) {
1500 y.tbl[i][offset] = t[i];
1501
1770 ZZ_pContext local_context;
1771 local_context.save();
1772
1773 pool->exec_range(n,
1774 [lo, m, n, offset, xx, &y, nprimes, &local_context, FFTInfo]
1775 (long first, long last) {
1776
1777 local_context.restore();
1778 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1779 // TmpSpace is thread local!
1780
1781 vec_long& t = ModularRepBuf();
1782 t.SetLength(nprimes);
1783
1784 long local_offset = (offset + first) & (n-1);
1785
1786 NTL_ZZ_pRegister(accum);
1787
1788 for (long j = first; j < last; j++) {
1789 if (j >= m) {
1790 for (long i = 0; i < nprimes; i++)
1791 y.tbl[i][local_offset] = 0;
15021792 }
1503 }
1504
1505 offset = (offset + 1) & (n-1);
1506 }
1507
1508
1509 for (i = 0; i < FFTInfo->NumPrimes; i++) {
1510 long *yp = &y.tbl[i][0];
1511 FFTRev1(yp, yp, k, i);
1512 }
1513
1514 }
1515
1516 void FromFFTRep(ZZ_pX& x, FFTRep& y, long lo, long hi)
1793 else {
1794 accum = xx[j+lo];
1795 for (long j1 = j + n; j1 < m; j1 += n)
1796 add(accum, accum, xx[j1+lo]);
1797 ToModularRep(t, accum, FFTInfo, TmpSpace);
1798 for (long i = 0; i < nprimes; i++) {
1799 y.tbl[i][local_offset] = t[i];
1800
1801 }
1802 }
1803
1804 local_offset = (local_offset + 1) & (n-1);
1805 }
1806 } );
1807
1808 pool->exec_range(nprimes,
1809 [&y, k](long first, long last) {
1810 for (long i = first; i < last; i++) {
1811 long *yp = &y.tbl[i][0];
1812 FFTRev1(yp, yp, k, i);
1813 }
1814 } );
1815
1816 }
1817
1818
1819 #endif
1820
1821
1822
1823
1824
1825
1826 NTL_TBDECL(FromFFTRep)(ZZ_pX& x, FFTRep& y, long lo, long hi)
15171827
15181828 // converts from FFT-representation to coefficient representation
15191829 // only the coefficients lo..hi are computed
15251835
15261836 long k, n, i, j, l;
15271837
1528 vec_long& t = ModularRepBuf;
1529
1530 t.SetLength(FFTInfo->NumPrimes);
1838 vec_long& t = ModularRepBuf();
1839
1840 long nprimes = FFTInfo->NumPrimes;
1841 t.SetLength(nprimes);
15311842
15321843 k = y.k;
15331844 n = (1L << k);
15341845
15351846
1536 for (i = 0; i < FFTInfo->NumPrimes; i++) {
1847 for (i = 0; i < nprimes; i++) {
15371848 long *yp = &y.tbl[i][0];
15381849 FFTRev1(yp, yp, k, i);
15391850 }
15441855 x.rep.SetLength(l);
15451856
15461857 for (j = 0; j < l; j++) {
1547 for (i = 0; i < FFTInfo->NumPrimes; i++)
1858 for (i = 0; i < nprimes; i++)
15481859 t[i] = y.tbl[i][j+lo];
15491860
15501861 FromModularRep(x.rep[j], t, FFTInfo, TmpSpace);
15531864 x.normalize();
15541865 }
15551866
1556 void RevFromFFTRep(vec_ZZ_p& x, FFTRep& y, long lo, long hi)
1867 #ifdef NTL_THREAD_BOOST
1868
1869 void FromFFTRep(ZZ_pX& x, FFTRep& y, long lo, long hi)
1870
1871 // converts from FFT-representation to coefficient representation
1872 // only the coefficients lo..hi are computed
1873
1874
1875 {
1876 BasicThreadPool *pool = GetThreadPool();
1877
1878 if (!pool || pool->active() || pool->NumThreads() == 1) {
1879 basic_FromFFTRep(x, y, lo, hi);
1880 return;
1881 }
1882 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1883
1884 long k, n, l;
1885
1886 long nprimes = FFTInfo->NumPrimes;
1887
1888 k = y.k;
1889 n = (1L << k);
1890
1891
1892 pool->exec_range(nprimes,
1893 [&y, k](long first, long last) {
1894 for (long i = first; i < last; i++) {
1895 long *yp = &y.tbl[i][0];
1896 FFTRev1(yp, yp, k, i);
1897 }
1898 } );
1899
1900 hi = min(hi, n-1);
1901 l = hi-lo+1;
1902 l = max(l, 0);
1903 x.rep.SetLength(l);
1904 ZZ_p *xx = x.rep.elts();
1905
1906 ZZ_pContext local_context;
1907 local_context.save();
1908
1909 pool->exec_range(l,
1910 [lo, xx, &y, nprimes, &local_context, FFTInfo]
1911 (long first, long last) {
1912
1913 local_context.restore();
1914 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1915 // TmpSpace is thread local!
1916
1917 vec_long& t = ModularRepBuf();
1918 t.SetLength(nprimes);
1919
1920 for (long j = first; j < last; j++) {
1921 for (long i = 0; i < nprimes; i++)
1922 t[i] = y.tbl[i][j+lo];
1923
1924 FromModularRep(xx[j], t, FFTInfo, TmpSpace);
1925 }
1926 } );
1927
1928 x.normalize();
1929 }
1930
1931
1932
1933 #endif
1934
1935
1936
1937
1938
1939
1940 NTL_TBDECL(RevFromFFTRep)(vec_ZZ_p& x, FFTRep& y, long lo, long hi)
15571941
15581942 // converts from FFT-representation to coefficient representation
15591943 // using "inverted" evaluation points.
15671951
15681952 long k, n, i, j, l;
15691953
1570 vec_long& t = ModularRepBuf;
1954 vec_long& t = ModularRepBuf();
15711955
15721956 k = y.k;
15731957 n = (1L << k);
15741958
1575 t.SetLength(FFTInfo->NumPrimes);
1576
1577 for (i = 0; i < FFTInfo->NumPrimes; i++) {
1959 long nprimes = FFTInfo->NumPrimes;
1960 t.SetLength(nprimes);
1961
1962 for (i = 0; i < nprimes; i++) {
15781963 long *yp = &y.tbl[i][0];
15791964 FFTFwd(yp, yp, k, i);
15801965 }
15851970 x.SetLength(l);
15861971
15871972 for (j = 0; j < l; j++) {
1588 for (i = 0; i < FFTInfo->NumPrimes; i++)
1973 for (i = 0; i < nprimes; i++)
15891974 t[i] = y.tbl[i][j+lo];
15901975
15911976 FromModularRep(x[j], t, FFTInfo, TmpSpace);
15921977 }
15931978 }
15941979
1595 void NDFromFFTRep(ZZ_pX& x, const FFTRep& y, long lo, long hi, FFTRep& z)
1980
1981 #ifdef NTL_THREAD_BOOST
1982
1983 void RevFromFFTRep(vec_ZZ_p& x, FFTRep& y, long lo, long hi)
1984 {
1985 BasicThreadPool *pool = GetThreadPool();
1986
1987 if (!pool || pool->active() || pool->NumThreads() == 1) {
1988 basic_RevFromFFTRep(x, y, lo, hi);
1989 return;
1990 }
1991 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1992
1993 long k, n, l;
1994
1995 long nprimes = FFTInfo->NumPrimes;
1996
1997 k = y.k;
1998 n = (1L << k);
1999
2000
2001 pool->exec_range(nprimes,
2002 [&y, k](long first, long last) {
2003 for (long i = first; i < last; i++) {
2004 long *yp = &y.tbl[i][0];
2005 FFTFwd(yp, yp, k, i);
2006 }
2007 } );
2008
2009 hi = min(hi, n-1);
2010 l = hi-lo+1;
2011 l = max(l, 0);
2012 x.SetLength(l);
2013 ZZ_p *xx = x.elts();
2014
2015 ZZ_pContext local_context;
2016 local_context.save();
2017
2018 pool->exec_range(l,
2019 [lo, xx, &y, nprimes, &local_context, FFTInfo]
2020 (long first, long last) {
2021
2022 local_context.restore();
2023 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
2024 // TmpSpace is thread local!
2025
2026 vec_long& t = ModularRepBuf();
2027 t.SetLength(nprimes);
2028
2029 for (long j = first; j < last; j++) {
2030 for (long i = 0; i < nprimes; i++)
2031 t[i] = y.tbl[i][j+lo];
2032
2033 FromModularRep(xx[j], t, FFTInfo, TmpSpace);
2034 }
2035 } );
2036
2037 }
2038
2039
2040
2041
2042 #endif
2043
2044
2045
2046
2047
2048
2049
2050 NTL_TBDECL(NDFromFFTRep)(ZZ_pX& x, const FFTRep& y, long lo, long hi, FFTRep& z)
15962051 {
15972052 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
15982053 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
16002055
16012056 long k, n, i, j, l;
16022057
1603 vec_long& t = ModularRepBuf;
1604
1605 t.SetLength(FFTInfo->NumPrimes);
2058 vec_long& t = ModularRepBuf();
2059
2060 long nprimes = FFTInfo->NumPrimes;
2061 t.SetLength(nprimes);
16062062 k = y.k;
16072063 n = (1L << k);
16082064
16092065 z.SetSize(k);
16102066
1611 for (i = 0; i < FFTInfo->NumPrimes; i++) {
2067 for (i = 0; i < nprimes; i++) {
16122068 long *zp = &z.tbl[i][0];
16132069 const long *yp = &y.tbl[i][0];
16142070
16212077 x.rep.SetLength(l);
16222078
16232079 for (j = 0; j < l; j++) {
1624 for (i = 0; i < FFTInfo->NumPrimes; i++)
2080 for (i = 0; i < nprimes; i++)
16252081 t[i] = z.tbl[i][j+lo];
16262082
16272083 FromModularRep(x.rep[j], t, FFTInfo, TmpSpace);
16302086 x.normalize();
16312087 }
16322088
1633 void NDFromFFTRep(ZZ_pX& x, FFTRep& y, long lo, long hi)
1634 {
1635 FFTRep z;
1636 NDFromFFTRep(x, y, lo, hi, z);
1637 }
1638
1639 void FromFFTRep(ZZ_p* x, FFTRep& y, long lo, long hi)
2089 #ifdef NTL_THREAD_BOOST
2090
2091 void NDFromFFTRep(ZZ_pX& x, const FFTRep& y, long lo, long hi, FFTRep& z)
16402092
16412093 // converts from FFT-representation to coefficient representation
16422094 // only the coefficients lo..hi are computed
16432095
16442096
16452097 {
2098 BasicThreadPool *pool = GetThreadPool();
2099
2100 if (!pool || pool->active() || pool->NumThreads() == 1) {
2101 basic_NDFromFFTRep(x, y, lo, hi, z);
2102 return;
2103 }
2104 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2105
2106 long k, n, l;
2107
2108 long nprimes = FFTInfo->NumPrimes;
2109
2110 k = y.k;
2111 n = (1L << k);
2112
2113 z.SetSize(k);
2114
2115 pool->exec_range(nprimes,
2116 [&y, &z, k](long first, long last) {
2117 for (long i = first; i < last; i++) {
2118 long *zp = &z.tbl[i][0];
2119 const long *yp = &y.tbl[i][0];
2120 FFTRev1(zp, yp, k, i);
2121 }
2122 } );
2123
2124 hi = min(hi, n-1);
2125 l = hi-lo+1;
2126 l = max(l, 0);
2127 x.rep.SetLength(l);
2128 ZZ_p *xx = x.rep.elts();
2129
2130 ZZ_pContext local_context;
2131 local_context.save();
2132
2133 pool->exec_range(l,
2134 [lo, xx, &z, nprimes, &local_context, FFTInfo]
2135 (long first, long last) {
2136
2137 local_context.restore();
2138 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
2139 // TmpSpace is thread local!
2140
2141 vec_long& t = ModularRepBuf();
2142 t.SetLength(nprimes);
2143
2144 for (long j = first; j < last; j++) {
2145 for (long i = 0; i < nprimes; i++)
2146 t[i] = z.tbl[i][j+lo];
2147
2148 FromModularRep(xx[j], t, FFTInfo, TmpSpace);
2149 }
2150 } );
2151
2152 x.normalize();
2153 }
2154
2155
2156
2157 #endif
2158
2159 void NDFromFFTRep(ZZ_pX& x, FFTRep& y, long lo, long hi)
2160 {
2161 FFTRep z;
2162 NDFromFFTRep(x, y, lo, hi, z);
2163 }
2164
2165 NTL_TBDECL(FromFFTRep)(ZZ_p* x, FFTRep& y, long lo, long hi)
2166
2167 // converts from FFT-representation to coefficient representation
2168 // only the coefficients lo..hi are computed
2169
2170
2171 {
16462172 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
16472173 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
16482174
16492175
16502176 long k, n, i, j;
16512177
1652 vec_long& t = ModularRepBuf;
2178 vec_long& t = ModularRepBuf();
16532179
16542180 k = y.k;
16552181 n = (1L << k);
16562182
1657 t.SetLength(FFTInfo->NumPrimes);
1658
1659 for (i = 0; i < FFTInfo->NumPrimes; i++) {
2183 long nprimes = FFTInfo->NumPrimes;
2184 t.SetLength(nprimes);
2185
2186 for (i = 0; i < nprimes; i++) {
16602187 long *yp = &y.tbl[i][0];
16612188 FFTRev1(yp, yp, k, i);
16622189 }
16652192 if (j >= n)
16662193 clear(x[j-lo]);
16672194 else {
1668 for (i = 0; i < FFTInfo->NumPrimes; i++)
2195 for (i = 0; i < nprimes; i++)
16692196 t[i] = y.tbl[i][j];
16702197
16712198 FromModularRep(x[j-lo], t, FFTInfo, TmpSpace);
16742201 }
16752202
16762203
1677 void mul(FFTRep& z, const FFTRep& x, const FFTRep& y)
2204 #ifdef NTL_THREAD_BOOST
2205
2206 void FromFFTRep(ZZ_p* x, FFTRep& y, long lo, long hi)
2207
2208 // converts from FFT-representation to coefficient representation
2209 // only the coefficients lo..hi are computed
2210
2211
2212 {
2213 BasicThreadPool *pool = GetThreadPool();
2214
2215 if (!pool || pool->active() || pool->NumThreads() == 1) {
2216 basic_FromFFTRep(x, y, lo, hi);
2217 return;
2218 }
2219
2220 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2221
2222
2223 long k, n, l;
2224
2225 k = y.k;
2226 n = (1L << k);
2227
2228 long nprimes = FFTInfo->NumPrimes;
2229
2230
2231 pool->exec_range(nprimes,
2232 [&y, k](long first, long last) {
2233 for (long i = first; i < last; i++) {
2234 long *yp = &y.tbl[i][0];
2235 FFTRev1(yp, yp, k, i);
2236 }
2237 } );
2238
2239
2240 ZZ_pContext local_context;
2241 local_context.save();
2242
2243 pool->exec_range(hi-lo+1,
2244 [n, lo, x, &y, nprimes, &local_context, FFTInfo]
2245 (long first, long last) {
2246
2247 local_context.restore();
2248 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
2249 // TmpSpace is thread local!
2250
2251 vec_long& t = ModularRepBuf();
2252 t.SetLength(nprimes);
2253
2254 for (long idx = first; idx < last; idx++) {
2255 long j = lo + idx;
2256
2257 if (j >= n)
2258 clear(x[j-lo]);
2259 else {
2260 for (long i = 0; i < nprimes; i++)
2261 t[i] = y.tbl[i][j];
2262
2263 FromModularRep(x[j-lo], t, FFTInfo, TmpSpace);
2264 }
2265 }
2266 } );
2267 }
2268
2269 #endif
2270
2271
2272 NTL_TBDECL(mul)(FFTRep& z, const FFTRep& x, const FFTRep& y)
16782273 {
16792274 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
16802275
16872282
16882283 z.SetSize(k);
16892284
1690 for (i = 0; i < FFTInfo->NumPrimes; i++) {
2285 long nprimes = FFTInfo->NumPrimes;
2286
2287 for (i = 0; i < nprimes; i++) {
16912288 long *zp = &z.tbl[i][0];
16922289 const long *xp = &x.tbl[i][0];
16932290 const long *yp = &y.tbl[i][0];
17002297
17012298 }
17022299
1703 void sub(FFTRep& z, const FFTRep& x, const FFTRep& y)
1704 {
2300
2301 #ifdef NTL_THREAD_BOOST
2302
2303 void mul(FFTRep& z, const FFTRep& x, const FFTRep& y)
2304 {
2305 BasicThreadPool *pool = GetThreadPool();
2306
2307 if (!pool || pool->active() || pool->NumThreads() == 1) {
2308 basic_mul(z, x, y);
2309 return;
2310 }
17052311 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
17062312
1707 long k, n, i, j;
2313 long k, n;
17082314
17092315 if (x.k != y.k) LogicError("FFT rep mismatch");
17102316
17132319
17142320 z.SetSize(k);
17152321
1716 for (i = 0; i < FFTInfo->NumPrimes; i++) {
2322 long nprimes = FFTInfo->NumPrimes;
2323
2324 pool->exec_range(nprimes,
2325 [&x, &y, &z, n](long first, long last) {
2326 for (long i = first; i < last; i++) {
2327 long *zp = &z.tbl[i][0];
2328 const long *xp = &x.tbl[i][0];
2329 const long *yp = &y.tbl[i][0];
2330 long q = GetFFTPrime(i);
2331 mulmod_t qinv = GetFFTPrimeInv(i);
2332
2333 for (long j = 0; j < n; j++)
2334 zp[j] = NormalizedMulMod(xp[j], yp[j], q, qinv);
2335 }
2336 } );
2337
2338 }
2339
2340 #endif
2341
2342
2343
2344 NTL_TBDECL(sub)(FFTRep& z, const FFTRep& x, const FFTRep& y)
2345 {
2346 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2347
2348 long k, n, i, j;
2349
2350 if (x.k != y.k) LogicError("FFT rep mismatch");
2351
2352 k = x.k;
2353 n = 1L << k;
2354
2355 z.SetSize(k);
2356
2357 long nprimes = FFTInfo->NumPrimes;
2358
2359 for (i = 0; i < nprimes; i++) {
17172360 long *zp = &z.tbl[i][0];
17182361 const long *xp = &x.tbl[i][0];
17192362 const long *yp = &y.tbl[i][0];
17222365 for (j = 0; j < n; j++)
17232366 zp[j] = SubMod(xp[j], yp[j], q);
17242367 }
1725 }
1726
1727 void add(FFTRep& z, const FFTRep& x, const FFTRep& y)
1728 {
2368
2369 }
2370
2371
2372 #ifdef NTL_THREAD_BOOST
2373
2374 void sub(FFTRep& z, const FFTRep& x, const FFTRep& y)
2375 {
2376 BasicThreadPool *pool = GetThreadPool();
2377
2378 if (!pool || pool->active() || pool->NumThreads() == 1) {
2379 basic_sub(z, x, y);
2380 return;
2381 }
17292382 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
17302383
1731 long k, n, i, j;
2384 long k, n;
17322385
17332386 if (x.k != y.k) LogicError("FFT rep mismatch");
17342387
17372390
17382391 z.SetSize(k);
17392392
1740 for (i = 0; i < FFTInfo->NumPrimes; i++) {
2393 long nprimes = FFTInfo->NumPrimes;
2394
2395 pool->exec_range(nprimes,
2396 [&x, &y, &z, n](long first, long last) {
2397 for (long i = first; i < last; i++) {
2398 long *zp = &z.tbl[i][0];
2399 const long *xp = &x.tbl[i][0];
2400 const long *yp = &y.tbl[i][0];
2401 long q = GetFFTPrime(i);
2402
2403 for (long j = 0; j < n; j++)
2404 zp[j] = SubMod(xp[j], yp[j], q);
2405 }
2406 } );
2407
2408 }
2409
2410 #endif
2411
2412
2413
2414 NTL_TBDECL(add)(FFTRep& z, const FFTRep& x, const FFTRep& y)
2415 {
2416 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2417
2418 long k, n, i, j;
2419
2420 if (x.k != y.k) LogicError("FFT rep mismatch");
2421
2422 k = x.k;
2423 n = 1L << k;
2424
2425 z.SetSize(k);
2426
2427 long nprimes = FFTInfo->NumPrimes;
2428
2429 for (i = 0; i < nprimes; i++) {
17412430 long *zp = &z.tbl[i][0];
17422431 const long *xp = &x.tbl[i][0];
17432432 const long *yp = &y.tbl[i][0];
17462435 for (j = 0; j < n; j++)
17472436 zp[j] = AddMod(xp[j], yp[j], q);
17482437 }
1749 }
1750
1751
1752 void reduce(FFTRep& x, const FFTRep& a, long k)
2438
2439 }
2440
2441
2442 #ifdef NTL_THREAD_BOOST
2443
2444 void add(FFTRep& z, const FFTRep& x, const FFTRep& y)
2445 {
2446 BasicThreadPool *pool = GetThreadPool();
2447
2448 if (!pool || pool->active() || pool->NumThreads() == 1) {
2449 basic_add(z, x, y);
2450 return;
2451 }
2452 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2453
2454 long k, n;
2455
2456 if (x.k != y.k) LogicError("FFT rep mismatch");
2457
2458 k = x.k;
2459 n = 1L << k;
2460
2461 z.SetSize(k);
2462
2463 long nprimes = FFTInfo->NumPrimes;
2464
2465 pool->exec_range(nprimes,
2466 [&x, &y, &z, n](long first, long last) {
2467 for (long i = first; i < last; i++) {
2468 long *zp = &z.tbl[i][0];
2469 const long *xp = &x.tbl[i][0];
2470 const long *yp = &y.tbl[i][0];
2471 long q = GetFFTPrime(i);
2472
2473 for (long j = 0; j < n; j++)
2474 zp[j] = AddMod(xp[j], yp[j], q);
2475 }
2476 } );
2477
2478 }
2479
2480 #endif
2481
2482
2483
2484
2485
2486
2487 NTL_TBDECL(reduce)(FFTRep& x, const FFTRep& a, long k)
17532488 // reduces a 2^l point FFT-rep to a 2^k point FFT-rep
17542489 // input may alias output
17552490 {
17662501
17672502 x.SetSize(k);
17682503
1769 for (i = 0; i < FFTInfo->NumPrimes; i++) {
2504
2505 long nprimes = FFTInfo->NumPrimes;
2506
2507 for (i = 0; i < nprimes; i++) {
17702508 ap = &a.tbl[i][0];
17712509 xp = &x.tbl[i][0];
17722510 for (j = 0; j < n; j++)
17742512 }
17752513 }
17762514
1777 void AddExpand(FFTRep& x, const FFTRep& a)
2515
2516 #ifdef NTL_THREAD_BOOST
2517
2518 void reduce(FFTRep& x, const FFTRep& a, long k)
2519 // reduces a 2^l point FFT-rep to a 2^k point FFT-rep
2520 // input may alias output
2521 {
2522 BasicThreadPool *pool = GetThreadPool();
2523
2524 if (!pool || pool->active() || pool->NumThreads() == 1) {
2525 basic_reduce(x, a, k);
2526 return;
2527 }
2528
2529 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2530
2531 long l, n;
2532
2533 l = a.k;
2534 n = 1L << k;
2535
2536 if (l < k) LogicError("reduce: bad operands");
2537
2538 x.SetSize(k);
2539
2540
2541 long nprimes = FFTInfo->NumPrimes;
2542
2543 pool->exec_range(nprimes,
2544 [&x, &a, n, l, k](long first, long last) {
2545 for (long i = first; i < last; i++) {
2546 const long *ap = &a.tbl[i][0];
2547 long *xp = &x.tbl[i][0];
2548 for (long j = 0; j < n; j++)
2549 xp[j] = ap[j << (l-k)];
2550 }
2551 } );
2552 }
2553
2554 #endif
2555
2556
2557
2558
2559 NTL_TBDECL(AddExpand)(FFTRep& x, const FFTRep& a)
17782560 // x = x + (an "expanded" version of a)
17792561 {
17802562 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
17872569
17882570 if (l < k) LogicError("AddExpand: bad args");
17892571
1790 for (i = 0; i < FFTInfo->NumPrimes; i++) {
2572
2573 long nprimes = FFTInfo->NumPrimes;
2574
2575 for (i = 0; i < nprimes; i++) {
17912576 long q = GetFFTPrime(i);
17922577 const long *ap = &a.tbl[i][0];
17932578 long *xp = &x.tbl[i][0];
17982583 }
17992584 }
18002585
1801
1802
1803 void ToZZ_pXModRep(ZZ_pXModRep& y, const ZZ_pX& x, long lo, long hi)
2586 #ifdef NTL_THREAD_BOOST
2587
2588 void AddExpand(FFTRep& x, const FFTRep& a)
2589 // x = x + (an "expanded" version of a)
2590 {
2591 BasicThreadPool *pool = GetThreadPool();
2592
2593 if (!pool || pool->active() || pool->NumThreads() == 1) {
2594 basic_AddExpand(x, a);
2595 return;
2596 }
2597 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2598
2599 long l, k, n;
2600
2601 l = x.k;
2602 k = a.k;
2603 n = 1L << k;
2604
2605 if (l < k) LogicError("AddExpand: bad args");
2606
2607
2608 long nprimes = FFTInfo->NumPrimes;
2609
2610 pool->exec_range(nprimes,
2611 [&x, &a, n, l, k](long first, long last) {
2612 for (long i = first; i < last; i++) {
2613 long q = GetFFTPrime(i);
2614 const long *ap = &a.tbl[i][0];
2615 long *xp = &x.tbl[i][0];
2616 for (long j = 0; j < n; j++) {
2617 long j1 = j << (l-k);
2618 xp[j1] = AddMod(xp[j1], ap[j], q);
2619 }
2620 }
2621 } );
2622 }
2623
2624
2625 #endif
2626
2627
2628
2629
2630
2631 NTL_TBDECL(ToZZ_pXModRep)(ZZ_pXModRep& y, const ZZ_pX& x, long lo, long hi)
18042632 {
18052633 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
18062634 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
18072635
18082636
18092637 long n, i, j;
1810 vec_long& t = ModularRepBuf;
1811
2638 vec_long& t = ModularRepBuf();
2639
2640
2641 long nprimes = FFTInfo->NumPrimes;
18122642 t.SetLength(FFTInfo->NumPrimes);
18132643
18142644 if (lo < 0)
18222652
18232653 for (j = 0; j < n; j++) {
18242654 ToModularRep(t, xx[j+lo], FFTInfo, TmpSpace);
1825 for (i = 0; i < FFTInfo->NumPrimes; i++)
2655 for (i = 0; i < nprimes; i++)
18262656 y.tbl[i][j] = t[i];
18272657 }
18282658 }
18292659
1830
1831 void ToFFTRep(FFTRep& x, const ZZ_pXModRep& a, long k, long lo, long hi)
1832 {
2660 #ifdef NTL_THREAD_BOOST
2661 void ToZZ_pXModRep(ZZ_pXModRep& y, const ZZ_pX& x, long lo, long hi)
2662 {
2663 BasicThreadPool *pool = GetThreadPool();
2664
2665 if (!pool || pool->active() || pool->NumThreads() == 1) {
2666 basic_ToZZ_pXModRep(y, x, lo, hi);
2667 return;
2668 }
2669
18332670 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
18342671
1835 vec_long s;
2672
2673 long n;
2674
2675 long nprimes = FFTInfo->NumPrimes;
2676
2677 if (lo < 0)
2678 LogicError("bad arg to ToZZ_pXModRep");
2679
2680 hi = min(hi, deg(x));
2681 n = max(hi-lo+1, 0);
2682
2683 y.SetSize(n);
2684
2685 const ZZ_p *xx = x.rep.elts();
2686
2687 ZZ_pContext local_context;
2688 local_context.save();
2689
2690 pool->exec_range(n,
2691 [lo, xx, &y, nprimes, &local_context, FFTInfo](long first, long last) {
2692
2693 local_context.restore();
2694 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
2695 // TmpSpace is thread local!
2696
2697 vec_long& t = ModularRepBuf();
2698 t.SetLength(nprimes);
2699
2700 for (long j = first; j < last; j++) {
2701 ToModularRep(t, xx[j+lo], FFTInfo, TmpSpace);
2702 for (long i = 0; i < nprimes; i++)
2703 y.tbl[i][j] = t[i];
2704 }
2705 } );
2706 }
2707 #endif
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717 NTL_TBDECL(ToFFTRep)(FFTRep& x, const ZZ_pXModRep& a, long k, long lo, long hi)
2718 {
2719 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2720
18362721 long n, m, i, j;
18372722
18382723 if (k < 0 || lo < 0)
18462731 if (m > n)
18472732 LogicError("bad args to ToFFTRep");
18482733
1849 s.SetLength(n);
1850 long *sp = s.elts();
18512734
18522735 x.SetSize(k);
18532736
1854 long NumPrimes = FFTInfo->NumPrimes;
1855
1856 for (i = 0; i < NumPrimes; i++) {
2737 long nprimes = FFTInfo->NumPrimes;
2738
2739 if (m == 0) {
2740 for (i = 0; i < nprimes; i++) {
2741 long *xp = &x.tbl[i][0];
2742 for (j = m; j < n; j++)
2743 xp[j] = 0;
2744 }
2745 }
2746 else {
2747 for (i = 0; i < nprimes; i++) {
2748 long *xp = &x.tbl[i][0];
2749 long *ap = &a.tbl[i][0];
2750 for (j = 0; j < m; j++)
2751 xp[j] = ap[lo+j];
2752 for (j = m; j < n; j++)
2753 xp[j] = 0;
2754
2755 FFTFwd(xp, xp, k, i);
2756 }
2757 }
2758 }
2759
2760 #ifdef NTL_THREAD_BOOST
2761 void ToFFTRep(FFTRep& x, const ZZ_pXModRep& a, long k, long lo, long hi)
2762 {
2763 BasicThreadPool *pool = GetThreadPool();
2764
2765 if (!pool || pool->active() || pool->NumThreads() == 1) {
2766 basic_ToFFTRep(x, a, k, lo, hi);
2767 return;
2768 }
2769
2770 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2771
2772 long n, m;
2773
2774 if (k < 0 || lo < 0)
2775 LogicError("bad args to ToFFTRep");
2776
2777 if (hi > a.n-1) hi = a.n-1;
2778
2779 n = 1L << k;
2780 m = max(hi-lo+1, 0);
2781
2782 if (m > n)
2783 LogicError("bad args to ToFFTRep");
2784
2785
2786 x.SetSize(k);
2787
2788 long nprimes = FFTInfo->NumPrimes;
2789
2790 if (m == 0) {
2791 for (long i = 0; i < nprimes; i++) {
2792 long *xp = &x.tbl[i][0];
2793 for (long j = m; j < n; j++)
2794 xp[j] = 0;
2795 }
2796 }
2797 else {
2798
2799 pool->exec_range(nprimes,
2800 [&x, &a, lo, m, n, k](long first, long last) {
2801
2802 for (long i = first; i < last; i++) {
2803 long *xp = &x.tbl[i][0];
2804 long *ap = &a.tbl[i][0];
2805 for (long j = 0; j < m; j++)
2806 xp[j] = ap[lo+j];
2807 for (long j = m; j < n; j++)
2808 xp[j] = 0;
2809
2810 FFTFwd(xp, xp, k, i);
2811 }
2812 } );
2813
2814 }
2815 }
2816 #endif
2817
2818
2819
2820 void FromFFTRep(ZZ_pXModRep& x, const FFTRep& a)
2821 {
2822 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2823 long nprimes = FFTInfo->NumPrimes;
2824 long k = a.k;
2825 long n = 1L << k;
2826
2827 x.SetSize(n);
2828 for (long i = 0; i < nprimes; i++) {
18572829 long *xp = &x.tbl[i][0];
1858 long *ap = (m == 0 ? 0 : &a.tbl[i][0]);
1859 for (j = 0; j < m; j++)
1860 sp[j] = ap[lo+j];
1861 for (j = m; j < n; j++)
1862 sp[j] = 0;
1863
1864 FFTFwd(xp, sp, k, i);
1865 }
1866 }
2830 long *ap = &a.tbl[i][0];
2831 FFTRev1(xp, ap, k, i);
2832 }
2833 }
2834
2835 void FromZZ_pXModRep(ZZ_pX& x, const ZZ_pXModRep& a, long lo, long hi)
2836 {
2837 const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
2838 ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
2839
2840 long n = a.n;
2841 long nprimes = FFTInfo->NumPrimes;
2842
2843 vec_long& t = ModularRepBuf();
2844 t.SetLength(nprimes);
2845
2846 hi = min(hi, n-1);
2847 long l = hi-lo+1;
2848 l = max(l, 0);
2849 x.rep.SetLength(l);
2850
2851 for (long j = 0; j < l; j++) {
2852 for (long i = 0; i < nprimes; i++)
2853 t[i] = a.tbl[i][j+lo];
2854
2855 FromModularRep(x.rep[j], t, FFTInfo, TmpSpace);
2856 }
2857
2858 x.normalize();
2859 }
2860
2861
18672862
18682863
18692864
26263621 long n = NumBits(e);
26273622 long i;
26283623
2629 ZZ_pX h;
3624 ZZ_pX h, h1;
26303625
26313626 h.SetMaxLength(F.n);
26323627 set(h);
26333628
26343629 for (i = n - 1; i >= 0; i--) {
2635 SqrMod(h, h, F);
2636 if (bit(e, i))
2637 MulByXMod(h, h, F);
3630 if (bit(e, i)) {
3631 SqrMod(h1, h, F);
3632 MulByXMod(h, h1, F);
3633 // NOTE: MulByXMod gives much faster multicore performance
3634 // when output does not alias input
3635 }
3636 else
3637 SqrMod(h, h, F);
26383638 }
26393639
26403640 if (e < 0) InvMod(h, h, F);
00
11 #include <NTL/ZZ_pX.h>
2
2 #include <NTL/BasicThreadPool.h>
33 #include <NTL/new.h>
4
5
6
7
8
49
510 NTL_START_IMPL
611
898903
899904
900905
901 void InnerProduct(ZZ_pX& x, const vec_ZZ_p& v, long low, long high,
906
907 NTL_TBDECL(InnerProduct)(ZZ_pX& x, const vec_ZZ_p& v, long low, long high,
902908 const vec_ZZ_pX& H, long n, ZZVec& t)
903909 {
904910 NTL_ZZRegister(s);
924930 conv(x.rep[j], t[j]);
925931 x.normalize();
926932 }
933
934
935 #ifdef NTL_THREAD_BOOST
936
937 void InnerProduct(ZZ_pX& x, const vec_ZZ_p& v, long low, long high,
938 const vec_ZZ_pX& H, long n, ZZVec& t)
939 {
940 BasicThreadPool *pool = GetThreadPool();
941
942 if (!pool || pool->active() || pool->NumThreads() == 1) {
943 basic_InnerProduct(x, v, low, high, H, n, t);
944 return;
945 }
946
947 high = min(high, v.length()-1);
948 x.rep.SetLength(n);
949
950 ZZ_pContext local_context;
951 local_context.save();
952
953 pool->exec_range(n,
954 [low, high, &x, &t, &H, &v, &local_context](long first, long last) {
955
956 local_context.restore();
957
958 NTL_ZZRegister(s);
959
960 for (long j = first; j < last; j++) clear(t[j]);
961
962 for (long i = low; i <= high; i++) {
963 const vec_ZZ_p& h = H[i-low].rep;
964 long m = min(h.length(), last);
965 const ZZ& w = rep(v[i]);
966
967 for (long j = first; j < m; j++) {
968 mul(s, w, rep(h[j]));
969 add(t[j], t[j], s);
970 }
971 }
972
973 for (long j = first; j < last; j++) conv(x.rep[j], t[j]);
974 } );
975
976 x.normalize();
977 }
978
979 #endif
927980
928981
929982 void CompMod(ZZ_pX& x, const ZZ_pX& g, const ZZ_pXArgument& A,
9871040
9881041
9891042
990 NTL_THREAD_LOCAL long ZZ_pXArgBound = 0;
1043 NTL_CHEAP_THREAD_LOCAL long ZZ_pXArgBound = 0;
9911044
9921045
9931046 void CompMod(ZZ_pX& x, const ZZ_pX& g, const ZZ_pX& h, const ZZ_pXModulus& F)
11441197
11451198
11461199
1147 void ProjectPowers(vec_ZZ_p& x, const vec_ZZ_p& a, long k,
1200 NTL_TBDECL(ProjectPowers)(vec_ZZ_p& x, const vec_ZZ_p& a, long k,
11481201 const ZZ_pXArgument& H, const ZZ_pXModulus& F)
11491202
11501203 {
11781231 }
11791232 }
11801233
1234
1235 #ifdef NTL_THREAD_BOOST
1236
1237 void ProjectPowers(vec_ZZ_p& x, const vec_ZZ_p& a, long k,
1238 const ZZ_pXArgument& H, const ZZ_pXModulus& F)
1239
1240 {
1241 BasicThreadPool *pool = GetThreadPool();
1242
1243 if (!pool || pool->active() || pool->NumThreads() == 1) {
1244 basic_ProjectPowers(x, a, k, H, F);
1245 return;
1246 }
1247
1248 long n = F.n;
1249
1250 if (a.length() > n || k < 0)
1251 LogicError("ProjectPowers: bad args");
1252 if (NTL_OVERFLOW(k, 1, 0))
1253 ResourceError("ProjectPowers: excessive args");
1254
1255
1256 long m = H.H.length()-1;
1257 long l = (k+m-1)/m - 1;
1258
1259 ZZ_pXMultiplier M;
1260 build(M, H.H[m], F);
1261
1262 vec_ZZ_p s(INIT_SIZE, n);
1263 s = a;
1264 StripZeroes(s);
1265
1266 x.SetLength(k);
1267
1268 ZZ_pContext local_context;
1269 local_context.save();
1270
1271
1272 for (long i = 0; i <= l; i++) {
1273 long m1 = min(m, k-i*m);
1274 ZZ_p* w = &x[i*m];
1275
1276 pool->exec_range(m1,
1277 [w, &H, &s, &local_context](long first, long last) {
1278 local_context.restore();
1279 for (long j = first; j < last; j++)
1280 InnerProduct(w[j], H.H[j].rep, s);
1281 } );
1282
1283
1284 if (i < l)
1285 UpdateMap(s, s, M, F);
1286 }
1287 }
1288
1289
1290 #endif
11811291
11821292
11831293 void ProjectPowers(vec_ZZ_p& x, const vec_ZZ_p& a, long k,
682682 return !IsX(s);
683683 }
684684
685 NTL_THREAD_LOCAL long ZZ_pX_BlockingFactor = 10;
685 NTL_CHEAP_THREAD_LOCAL long ZZ_pX_BlockingFactor = 10;
686686
687687 void DDF(vec_pair_ZZ_pX_long& factors, const ZZ_pX& ff, const ZZ_pX& hh,
688688 long verbose)
14661466
14671467 /************* NEW DDF ****************/
14681468
1469 NTL_THREAD_LOCAL long ZZ_pX_GCDTableSize = 4;
1470 NTL_THREAD_LOCAL double ZZ_pXFileThresh = NTL_FILE_THRESH;
1471 NTL_THREAD_LOCAL static vec_ZZ_pX *BabyStepFile = 0;
1472 NTL_THREAD_LOCAL static vec_ZZ_pX *GiantStepFile = 0;
1473 NTL_THREAD_LOCAL static long use_files;
1469 NTL_CHEAP_THREAD_LOCAL long ZZ_pX_GCDTableSize = 4;
1470 NTL_CHEAP_THREAD_LOCAL double ZZ_pXFileThresh = NTL_FILE_THRESH;
1471 static NTL_CHEAP_THREAD_LOCAL vec_ZZ_pX *BabyStepFile = 0;
1472 static NTL_CHEAP_THREAD_LOCAL vec_ZZ_pX *GiantStepFile = 0;
1473 static NTL_CHEAP_THREAD_LOCAL long use_files;
14741474
14751475
14761476 static
44 #include <NTL/SmartPtr.h>
55
66 NTL_CLIENT
7
8 #ifdef NTL_THREADS
9 #error "NTL_THREADS does not work with classic LIP: use GMP instead"
10 #endif
711
812
913 #define MustAlloc(c, len) (!(c) || ((c)[-1] >> 1) < (len))
20992103 /* signs a and b are different...use _ntl_zsub */
21002104
21012105 if (anegative) {
2106 // UNSAFE
21022107 // FIXME: this is too ugly
21032108 a[0] = -sa;
21042109 NTL_SCOPE(guard) { if (!a_alias) a[0] = sa; };
21092114 guard.relax();
21102115 }
21112116 else {
2117 // UNSAFE
21122118 // FIXME: this is too ugly
21132119 b[0] = -sb;
21142120 NTL_SCOPE(guard) { if (!b_alias) b[0] = sb; };
22352241 /* signs of a and b are different...use _ntl_zadd */
22362242
22372243 if (anegative) {
2244 // UNSAFE
22382245 // FIXME: this is too ugly
22392246 a[0] = -sa;
22402247 NTL_SCOPE(guard) { if (!a_alias) a[0] = sa; };
22482255 c[0] = -c[0];
22492256 }
22502257 else {
2258 // UNSAFE
22512259 // FIXME: this is too ugly
22522260 b[0] = -sb;
22532261 NTL_SCOPE(guard) { if (!b_alias) b[0] = sb; };
23202328
23212329 // EXCEPTIONS: delay assignment to a[0] until after memory allocation,
23222330 // the remaining code is exception free
2331
2332 // UNSAFE
23232333
23242334 a[0] = sa;
23252335
26122622 /* and subtract from T3 */
26132623
26142624 {
2625 // UNSAFE
2626
26152627 long olda, oldb;
26162628
26172629 olda = a[hsa]; a[hsa] = sa-hsa;
26272639 /* recursively compute a_lo*b_lo into low part of c */
26282640 /* and subtract from T3 */
26292641
2642 // UNSAFE
2643
26302644 *a = hsa;
26312645 *b = hsa;
26322646
26522666
26532667 /* recursively compute b*a_hi into high part of c */
26542668 {
2669 // UNSAFE
2670
26552671 long olda;
26562672
26572673 olda = a[hsa]; a[hsa] = sa-hsa;
26602676 }
26612677
26622678 /* recursively compute b*a_lo into T */
2679
2680 // UNSAFE
26632681
26642682 *a = hsa;
26652683 kar_mul(T, a, b, stk);
27312749 kar_fold(T1, a, hsa);
27322750 kar_sq(T2, T1, stk);
27332751
2752 // UNSAFE
2753
27342754 olda = a[hsa]; a[hsa] = sa - hsa;
27352755 kar_sq(c + (hsa << 1), a + hsa, stk);
27362756 kar_sub(T2, c + (hsa << 1));
27372757 a[hsa] = olda;
27382758
2759 // UNSAFE
2760
27392761 *a = hsa;
27402762 kar_sq(c, a, stk);
27412763 kar_sub(T2, c);
27802802 }
27812803
27822804
2805
2806 // UNSAFE
27832807
27842808 sa = *a;
27852809 if (sa < 0) {
30153039 a = mem;
30163040 }
30173041
3042
3043 // UNSAFE
3044
30183045 sa = *a;
30193046
30203047 if (*a < 0) {
35213548 return;
35223549 }
35233550
3551 // UNSAFE
3552
35243553 sign = 0;
35253554 if (sa < 0) {
35263555 a[0] = sa = -sa;
36823711 _ntl_zintoz(_ntl_zsmod(a, -b[1]), rr);
36833712 return;
36843713 }
3714
3715 // UNSAFE
36853716
36863717 sign = 0;
36873718 if (sa < 0) {
54625493 return;
54635494 }
54645495
5496 // UNSAFE
54655497
54665498 if (m1negative = (mm1[0] < 0))
54675499 mm1[0] = -mm1[0];
61576189 long i;
61586190 _ntl_verylong a;
61596191 long bitpos, wordpos, bitoffset, diff;
6192 long nbits;
6193 unsigned long carry, tmp;
6194
6195 while (n > 0 && p[n-1] == 0) n--;
61606196
61616197 if (n <= 0) {
61626198 _ntl_zzero(x);
61636199 return;
61646200 }
61656201
6202
61666203 if (n > (NTL_MAX_LONG-(NTL_NBITS-1))/8)
61676204 ResourceError("ZZFromBytes: excessive length");
61686205
6169 sz = (n*8 + NTL_NBITS-1)/NTL_NBITS;
6206 nbits = 0;
6207 tmp = p[n-1];
6208 while (tmp) {
6209 tmp >>= 1;
6210 nbits++;
6211 }
6212
6213 sz = ((n-1)*8 + nbits + NTL_NBITS-1)/NTL_NBITS;
61706214
61716215 _ntl_zsetlength(x, sz);
61726216
61756219 for (i = 1; i <= sz; i++)
61766220 a[i] = 0;
61776221
6222 carry = 0;
61786223 for (i = 0; i < n; i++) {
61796224 bitpos = i*8;
61806225 wordpos = bitpos/NTL_NBITS;
61816226 bitoffset = bitpos - wordpos*NTL_NBITS;
61826227 diff = NTL_NBITS-bitoffset;
61836228
6184 if (diff < 8) {
6185 a[wordpos+1] |=
6229 a[wordpos+1] |= carry |
61866230 ((( ((unsigned long)(p[i])) & 255UL ) << bitoffset) & NTL_RADIXM);
6187 a[wordpos+2] = ( ((long)(p[i])) & 255 ) >> diff;
6188 }
6189 else {
6190 a[wordpos+1] |= (( ((long)(p[i])) & 255 ) << bitoffset);
6191 }
6192 }
6193
6194 while (sz > 1 && a[sz] == 0) sz--;
6231
6232 carry = ( ((unsigned long)(p[i])) & 255UL ) >> diff;
6233 }
6234
6235 a[sz] |= carry;
61956236 a[0] = sz;
61966237 }
61976238
66816722 }
66826723
66836724
6725 // boilerplate to provide compatible interface
6726 class _ntl_reduce_struct_plain : public _ntl_reduce_struct {
6727 public:
6728 _ntl_verylong_wrapped N;
6729
6730 void eval(_ntl_verylong *rres, _ntl_verylong *TT)
6731 {
6732 _ntl_zmod(*TT, N, rres);
6733 }
6734
6735 void adjust(_ntl_verylong *x) { }
6736 };
6737
6738 _ntl_reduce_struct *
6739 _ntl_reduce_struct_build(_ntl_verylong modulus, _ntl_verylong excess)
6740 {
6741 UniquePtr<_ntl_reduce_struct_plain> C;
6742 C.make();
6743
6744 _ntl_zcopy(modulus, &C->N);
6745
6746 return C.release();
6747 }
6748
6749
6750
6751
6752 // general preconditioned remainder
6753
6754 class _ntl_general_rem_one_impl : public _ntl_general_rem_one_struct {
6755 };
6756
6757 _ntl_general_rem_one_struct *
6758 _ntl_general_rem_one_struct_build(long p, long sz)
6759 {
6760 return 0;
6761 }
6762
6763 long
6764 _ntl_general_rem_one_struct_apply(NTL_verylong a, long p, _ntl_general_rem_one_struct *pinfo)
6765 {
6766 return _ntl_zsmod(a, p);
6767 }
6768
6769
6770
8282
8383 #endif
8484
85 #if @{NTL_DISABLE_TLS_HACK}
86 #define NTL_DISABLE_TLS_HACK
87
88 /* Set if you want to compile NTL without "TLS hack"
89 *
90 * To re-build after changing this flag: rm *.o; make ntl.a
91 */
92
93 #endif
94
95 #if @{NTL_ENABLE_TLS_HACK}
96 #define NTL_ENABLE_TLS_HACK
97
98 /* Set if you want to compile NTL with "TLS hack"
99 *
100 * To re-build after changing this flag: rm *.o; make ntl.a
101 */
102
103 #endif
104
85105 #if @{NTL_THREADS}
86106 #define NTL_THREADS
87107
103123
104124 #endif
105125
126 #if @{NTL_THREAD_BOOST}
127 #define NTL_THREAD_BOOST
128
129 /* Set if you want to compile NTL to exploit threads internally.
130 *
131 * To re-build after changing this flag: rm *.o; make ntl.a
132 */
133
134 #endif
135 #
106136
107137 #if @{NTL_GMP_LIP}
108138 #define NTL_GMP_LIP
147177
148178 #endif
149179
150 #if @{NTL_PCLMUL}
151 #define NTL_PCLMUL
152
153 /*
154 * Use this flag for faster GF2X arithmetc.
155 * This enables the use of the PCLMUL instruction on x86-64
156 * machines.
157 *
158 * To re-build after changing this flag:
159 * rm GF2X.o; make ntl.a
160 */
161
162 #endif
163180
164181 #if @{FLAG_LONG_LONG_TYPE}
165182 #define NTL_LONG_LONG_TYPE @{NTL_LONG_LONG_TYPE}
300317 #if @{NTL_DISABLE_LONGDOUBLE}
301318 #define NTL_DISABLE_LONGDOUBLE
302319
303 /* Explicitly disables us of long double arithmetic in the
304 * single-precision modular arithmetic routines
320 /* Explicitly disables use of long double arithmetic
305321 */
306322
307323 #endif
310326 #if @{NTL_DISABLE_LONGLONG}
311327 #define NTL_DISABLE_LONGLONG
312328
313 /* Explicitly disables us of long long arithmetic in the
314 * single-precision modular arithmetic routines
315 */
316
317 #endif
318
319
329 /* Explicitly disables use of long long arithmetic
330 */
331
332 #endif
333
334 #if @{NTL_DISABLE_LL_ASM}
335 #define NTL_DISABLE_LL_ASM
336
337 /* Explicitly disables use of inline assembly as a replacement
338 * for long long arithmetic.
339 */
340
341 #endif
342
343
344 #if @{NTL_MAXIMIZE_SP_NBITS}
345 #define NTL_MAXIMIZE_SP_NBITS
346
347 /* Allows for 62-bit single-precision moduli on 64-bit platforms.
348 * By default, such moduli are restricted to 60 bits, which
349 * usually gives slightly better performance across a range of
350 * parameters.
351 */
352
353 #endif
320354
321355 /*************************************************************************
322356 *
505539 #endif
506540
507541
542 #if @{NTL_CRT_ALTCODE}
543 #define NTL_CRT_ALTCODE
544
545 /*
546 * Employs an alternative CRT strategy.
547 * Only relevant with GMP.
548 * Seems to be marginally faster on some x86_64 platforms.
549 *
550 * To re-build after changing this flag:
551 * rm lip.o; make ntl.a
552 */
553
554 #endif
555
556 #if @{NTL_CRT_ALTCODE_SMALL}
557 #define NTL_CRT_ALTCODE_SMALL
558
559 /*
560 * Employs an alternative CRT strategy for small moduli.
561 * Only relevant with GMP.
562 * Seems to be marginally faster on some x86_64 platforms.
563 *
564 * To re-build after changing this flag:
565 * rm lip.o; make ntl.a
566 */
567
568 #endif
569
508570
509571 #if @{NTL_GF2X_ALTCODE}
510572 #define NTL_GF2X_ALTCODE
547609 #endif
548610
549611
612 #if @{NTL_PCLMUL}
613 #define NTL_PCLMUL
614
615 /*
616 * Use this flag for faster GF2X arithmetic.
617 * This enables the use of the PCLMUL instruction on x86-64
618 * machines.
619 *
620 * To re-build after changing this flag:
621 * rm GF2X.o; make ntl.a
622 */
623
624 #endif
550625
551626
552627
553628 @{WIZARD_HACK}
554629
555630
556
557
558 #endif
631 #endif
66 # Also, some shells do not handle "$@" correctly when
77 # no options are supplied, so this is handled as a special case.
88
9 ARGS=""
10 rm -f RETRY_CONFIG
9
10
1111
1212 if test $# -ne 0
1313 then
1616 perl DoConfig
1717 fi
1818
19 while test -f RETRY_CONFIG
20 do
21 ARGS="$ARGS `cat RETRY_CONFIG`"
22 rm RETRY_CONFIG
23
24 if test $# -ne 0
25 then
26 perl DoConfig "$@" $ARGS
27 else
28 perl DoConfig $ARGS
29 fi
30
31 done
32
33
34
2020 * side effect of forcing its argument into memory.
2121 */
2222
23 NTL_THREAD_LOCAL volatile double _ntl_IsFinite__local;
24 NTL_THREAD_LOCAL volatile double *_ntl_IsFinite__ptr1 = &_ntl_IsFinite__local;
25 NTL_THREAD_LOCAL volatile double *_ntl_IsFinite__ptr2 = &_ntl_IsFinite__local;
26 NTL_THREAD_LOCAL volatile double *_ntl_IsFinite__ptr3 = &_ntl_IsFinite__local;
27 NTL_THREAD_LOCAL volatile double *_ntl_IsFinite__ptr4 = &_ntl_IsFinite__local;
23 NTL_CHEAP_THREAD_LOCAL volatile double _ntl_IsFinite__local = 0;
2824
2925 long _ntl_IsFinite(double *p)
3026 {
31 *_ntl_IsFinite__ptr1 = *p;
32 *_ntl_IsFinite__ptr3 = (*_ntl_IsFinite__ptr2 - *p);
33 if (*_ntl_IsFinite__ptr4 != 0.0) return 0;
27 _ntl_IsFinite__local = *p;
28 double x1 = _ntl_IsFinite__local;
29 double x2 = _ntl_IsFinite__local;
30 double x3 = x1-x2;
31 if (x3 != 0.0) return 0;
3432 return 1;
3533 }
3634
4745
4846 void _ntl_ForceToMem(double *p)
4947 {
50 *_ntl_IsFinite__ptr1 = *p;
51 *p = *_ntl_IsFinite__ptr2;
48 _ntl_IsFinite__local = *p;
49 *p = _ntl_IsFinite__local;
5250 }
5351
5452
7573 * overly-aggressive optimizing compilers from screwing things up.
7674 */
7775
78 NTL_THREAD_LOCAL volatile double _ntl_ldexp_zero = 0.0;
76 NTL_CHEAP_THREAD_LOCAL volatile double _ntl_ldexp_zero = 0.0;
7977
8078 double _ntl_ldexp(double x, long e)
8179 {
1212
1313 CXXFLAGS=-g -O2
1414 # Flags for the C++ compiler
15
16 CXXAUTOFLAGS=
17 # Flags for the C++ compiler, automatically generated by configuration script
1518
1619
1720 AR=ar
6972
7073 GMP_OPT_INCDIR=# -I$(GMP_INCDIR) # GMPI
7174 GMP_OPT_LIBDIR=# -L$(GMP_LIBDIR) # GMPL
72 GMP_OPT_LIB=# -lgmp # GMP
75 GMP_OPT_LIB=-lgmp # GMP
7376 # uncomment these if using GMP
7477
7578
136139 O16=$(O15)
137140 O17=$(O16)
138141 O18=$(O17) xdouble.o
139 O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o
142 O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o BasicThreadPool.o
140143
141144 OBJ=$(O19)
142145
161164 S16=$(S15)
162165 S17=$(S16)
163166 S18=$(S17) xdouble.c
164 S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c
167 S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c BasicThreadPool.c
165168
166169 SRC = $(S19)
167170
193196 IN16=$(IN15) vec_vec_ZZ_p.h vec_vec_ZZ_pE.h vec_vec_long.h vec_vec_lzz_p.h
194197 IN17=$(IN16) vec_vec_lzz_pE.h vec_xdouble.h xdouble.h config.h version.h
195198 IN18=$(IN17) def_config.h new.h vec_ulong.h vec_vec_ulong.h c_lip.h g_lip.h
196 IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h
197 IN20=$(IN19) have_LL_no.h have_LL_yes.h have_builtin_clzl_no.h have_builtin_clzl_yes.h
198
199 INCL=$(IN20)
199 IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h BasicThreadPool.h
200 INCL=$(IN19)
200201
201202
202203
212213 # test source files
213214
214215 TS1=QuickTest.c BerlekampTest.c CanZassTest.c ZZXFacTest.c MoreFacTest.c LLLTest.c
215 TS2=$(TS1) subset.c MatrixTest.c CharPolyTest.c RRTest.c QuadTest.c
216 TS2=$(TS1) subset.c MatrixTest.c mat_lzz_pTest.c CharPolyTest.c RRTest.c QuadTest.c
216217 TS3=$(TS2) GF2XTest.c GF2EXTest.c BitMatTest.c ZZ_pEXTest.c lzz_pEXTest.c Timing.c
217218 TS4=$(TS3) ThreadTest.c ExceptionTest.c
218219 TS = $(TS4)
219220
220221 # scripts
221222
222 SCRIPTS1=MakeGetTime MakeGetPID MakeCheckCLZL MakeCheckLL TestScript dosify unixify RemoveProg
223 SCRIPTS1=MakeGetTime MakeGetPID MakeCheckFeature ResetFeatures CopyFeatures TestScript dosify unixify RemoveProg
223224 SCRIPTS2=$(SCRIPTS1) configure DoConfig mfile cfile ppscript
224225
225226 SCRIPTS=$(SCRIPTS2)
226227
227228 # auxiliary source
228229
229 MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c CheckPCLMUL.c
230 GT=GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
230 MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c
231 GT=GetTime0.c GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
231232 GP=GetPID1.c GetPID2.c TestGetPID.c
232 CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c
233 CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c CheckAVX.c CheckFMA.c CheckCompile.c
234
235 AUXPROGS = TestGetTime TestGetPID CheckFeature CheckCompile
233236
234237
235238
236239 # documentation
237240
238241
239 D01=copying.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
242 D01=copying.txt BasicThreadPool.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
240243 D02=$(D01) GF2XFactoring.txt GF2XVec.txt HNF.txt Lazy.txt LazyTable.txt LLL.txt RR.txt SmartPtr.txt
241244 D03=$(D02) ZZ.txt ZZVec.txt ZZX.txt ZZXFactoring.txt ZZ_p.txt ZZ_pE.txt
242245 D04=$(D03) ZZ_pEX.txt ZZ_pEXFactoring.txt ZZ_pX.txt ZZ_pXFactoring.txt
252255 D14=$(D13) tour-modules.html tour-unix.html tour-examples.html
253256 D15=$(D14) tour-roadmap.html tour-win.html tour-impl.html tour-struct.html
254257 D16=$(D15) tour.html tour-ex1.html tour-ex2.html tour-ex3.html tour-ex4.html
255 D17=$(D16) tour-ex5.html tour-ex6.html arrow1.gif arrow2.gif arrow3.gif
258 D17=$(D16) tour-ex5.html tour-ex6.html tour-ex7.html arrow1.gif arrow2.gif arrow3.gif
256259 D18=$(D17) tour-gmp.html tour-gf2x.html tour-tips.html config.txt version.txt
257260
258261 TX01=GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt GF2XFactoring.txt
263266 TX06=mat_ZZ_pE.txt mat_lzz_p.txt mat_lzz_pE.txt mat_poly_ZZ.txt mat_poly_ZZ_p.txt
264267 TX07=mat_poly_lzz_p.txt matrix.txt pair.txt quad_float.txt tools.txt vec_GF2.txt
265268 TX08=vec_GF2E.txt vec_RR.txt vec_ZZ.txt vec_ZZ_p.txt vec_ZZ_pE.txt vec_lzz_p.txt
266 TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt
269 TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt BasicThreadPool.txt
267270
268271 TXFILES=$(TX01) $(TX02) $(TX03) $(TX04) $(TX05) $(TX06) $(TX07) $(TX08) $(TX09)
269272
275278 HT06=mat_ZZ_pE.cpp.html mat_lzz_p.cpp.html mat_lzz_pE.cpp.html mat_poly_ZZ.cpp.html mat_poly_ZZ_p.cpp.html
276279 HT07=mat_poly_lzz_p.cpp.html matrix.cpp.html pair.cpp.html quad_float.cpp.html tools.cpp.html vec_GF2.cpp.html
277280 HT08=vec_GF2E.cpp.html vec_RR.cpp.html vec_ZZ.cpp.html vec_ZZ_p.cpp.html vec_ZZ_pE.cpp.html vec_lzz_p.cpp.html
278 HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html
281 HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html BasicThreadPool.cpp.html
279282
280283 HTFILES=$(HT01) $(HT02) $(HT03) $(HT04) $(HT05) $(HT06) $(HT07) $(HT08) $(HT09)
281284
287290 # test program executables
288291
289292 PROG1=QuickTest BerlekampTest CanZassTest ZZXFacTest MoreFacTest LLLTest BitMatTest
290 PROG2=$(PROG1) MatrixTest CharPolyTest RRTest QuadTest
293 PROG2=$(PROG1) MatrixTest mat_lzz_pTest CharPolyTest RRTest QuadTest
291294 PROG3=$(PROG2) GF2XTest GF2EXTest subset ZZ_pEXTest lzz_pEXTest Timing ThreadTest
292295 PROGS = $(PROG3)
293296
294297 # things to save to a tar file
295298
296299 SFI1=makefile $(SRC) $(SINC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) mach_desc.win
297 SFI2=$(SFI1) MulTimeTest.c PolyTimeTest.c Poly1TimeTest.c GF2XTimeTest.c
300 SFI2=$(SFI1) MulTimeTest.c Poly1TimeTest.c Poly2TimeTest.c Poly3TimeTest.c GF2XTimeTest.c
298301 SFI3=$(SFI2) InitSettings.c DispSettings.c WizardAux Wizard def_makefile
299302 SFILES=$(SFI3)
300303
309312 NTL_INCLUDE = -I../include -I.
310313 # NTL needs this to find its include files
311314
312 COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) -c
313
314 LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS)
315 COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) -c
316
317 LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) $(LDFLAGS)
315318
316319
317320
341344 # setup2 does some dynamic checks for GetTime, GetPID, __builtin_clzl, LL types, AVX, and FMA
342345
343346 setup2:
347 echo "*** CheckFeature log ***" > CheckFeature.log
344348 sh MakeGetTime "$(LINK)" "$(LDLIBS)"
345349 sh MakeGetPID "$(LINK)" "$(LDLIBS)"
346 sh MakeCheckCLZL "$(LINK)" "$(LDLIBS)"
347 sh MakeCheckLL "$(LINK)" "$(LDLIBS)"
350 sh MakeCheckFeature BUILTIN_CLZL "CheckCLZL.c CheckCLZLAux.c" "$(LINK)" "$(LDLIBS)"
351 sh MakeCheckFeature LL_TYPE "CheckLL.c CheckLLAux.c" "$(LINK)" "$(LDLIBS)"
352 sh MakeCheckFeature AVX "CheckAVX.c" "$(LINK)" "$(LDLIBS)"
353 sh MakeCheckFeature FMA "CheckFMA.c" "$(LINK)" "$(LDLIBS)"
348354
349355 # setup3 generates the file ../include/NTL/gmp_aux.h
350356 # The file ../include/NTL/gmp_aux.h is included in ../include/NTL/lip.h
382388 GetPID.o: GetPID.c
383389 $(LCOMP) $(COMPILE) GetPID.c
384390
385 CheckPCLMUL: CheckPCLMUL.c
386 $(LINK) -o CheckPCLMUL CheckPCLMUL.c $(LDLIBS)
391 CheckCompile: CheckCompile.c
392 $(LINK) -o CheckCompile CheckCompile.c $(LDLIBS)
393
387394
388395 .c.o:
389396 $(LCOMP) $(COMPILE) $(GF2X_OPT_INCDIR) $<
460467
461468 clobber:
462469 rm -f ntl.a mach_desc.h ../include/NTL/mach_desc.h GetTime.c GetPID.c
463 cp ../include/NTL/have_LL_no.h ../include/NTL/have_LL.h
464 cp ../include/NTL/have_builtin_clzl_no.h ../include/NTL/have_builtin_clzl.h
470 sh ResetFeatures '..'
465471 rm -f ../include/NTL/gmp_aux.h
466 sh RemoveProg $(PROGS) MakeDesc TestGetTime TestGetPID gen_gmp_aux
472 sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
467473 rm -f *.o
468474 rm -rf small
469475 rm -f cfileout mfileout
471477 rm -f all
472478
473479 clean:
474 sh RemoveProg MakeDesc TestGetTime TestGetPID gen_gmp_aux
480 sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
475481 rm -f *.o
476482 rm -rf small
477483 # - $(LIBTOOL) --mode=clean rm -f libntl.la *.lo #LSHAR
497503
498504
499505 package:
506 ./configure --nowrite
507 cp mfileout def_makefile
508 cp cfileout ../include/NTL/def_config.h
500509 sh unixify "$(SFILES) DIRNAME WINDIR VERSION_INFO NOTES" "$(INCL)" "$(DOC)"
501510 rm -rf `cat DIRNAME`
502511 rm -f `cat DIRNAME`.tar
508517 rm -rf `cat DIRNAME`
509518
510519 winpack:
520 ./configure --nowrite NTL_GMP_LIP=off
521 cp mfileout def_makefile
522 cp cfileout ../include/NTL/def_config.h
511523 sh dosify "$(SRC)" "$(INCL)" "$(DOC)" "$(TS)" "$(TD)" "$(SINC)"
512524 rm -rf `cat WINDIR`
513525 rm -f `cat WINDIR`.zip
526538
527539 WO1 = FFT.o GetTime.o GetPID.o ctools.o ZZ.o ZZVec.o ZZ_p.o ZZ_pX.o
528540 WO2 = $(WO1) ZZ_pX1.o lip.o tools.o vec_ZZ.o vec_ZZ_p.o
529 WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o fileio.o
541 WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o BasicThreadPool.o fileio.o
530542
531543 WOBJ = $(WO3)
532544
538550 MulTimeTest:
539551 $(LINK) -o MulTimeTest MulTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
540552
541 PolyTimeTest:
542 $(LINK) -o PolyTimeTest PolyTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
543553
544554 Poly1TimeTest:
545555 $(LINK) -o Poly1TimeTest Poly1TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
556 Poly2TimeTest:
557 $(LINK) -o Poly2TimeTest Poly2TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
558 Poly3TimeTest:
559 $(LINK) -o Poly3TimeTest Poly3TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
546560
547561
548562 GF2XTimeTest:
2020 cp mach_desc.win dos/include/NTL/mach_desc.h
2121
2222
23 cp GetTime0.c dos/GetTime/GetTime0.cpp
2324 cp GetTime1.c dos/GetTime/GetTime1.cpp
2425 cp GetTime2.c dos/GetTime/GetTime2.cpp
2526 cp GetTime3.c dos/GetTime/GetTime3.cpp
6061 cp $6 dos/src
6162
6263 cp ../include/NTL/def_config.h dos/include/NTL/config.h
63 cp ../include/NTL/have_LL_no.h dos/include/NTL/have_LL.h
64 cp ../include/NTL/have_builtin_clzl_no.h dos/include/NTL/have_builtin_clzl.h
64 sh ResetFeatures dos
6565
6666
8181
8282 const char *FileName(const char* stem, long d)
8383 {
84 NTL_THREAD_LOCAL static string sbuf;
84 NTL_TLS_LOCAL(string, sbuf);
8585
8686 stringstream ss;
8787 ss << "tmp-ntl-" << stem;
111111 static AtomicCounter cnt; // a GLOBAL counter
112112
113113
114 NTL_THREAD_LOCAL static string ID;
115 NTL_THREAD_LOCAL static bool initialized = false;
116 NTL_THREAD_LOCAL static unsigned long local_cnt = cnt.inc();
117 NTL_THREAD_LOCAL static unsigned long local_time = time(0);
118 NTL_THREAD_LOCAL static unsigned long local_clock = clock();
114 NTL_TLS_LOCAL(string, ID);
115
116 NTL_TLS_LOCAL_INIT(bool, initialized, (false));
117 NTL_TLS_LOCAL_INIT(unsigned long, local_cnt, (cnt.inc()));
118 NTL_TLS_LOCAL_INIT(unsigned long, local_time, (time(0)));
119 NTL_TLS_LOCAL_INIT(unsigned long, local_clock, (clock()));
119120
120121 if (!initialized) {
121122 stringstream ss;
342342
343343
344344
345 #if (defined(NTL_HAVE_LL_TYPE) && NTL_ZZ_NBITS == NTL_BITS_PER_LONG)
346 #define NTL_VIABLE_LL
347 #endif
348
349 #if (defined(NTL_CRT_ALTCODE) || defined(NTL_CRT_ALTCODE_SMALL))
350 #define NTL_TBL_CRT
351 #endif
352
353
345354
346355 class _ntl_gbigint_watcher {
347356 public:
394403 // this logic onto what was originally pure-C code.
395404
396405
397 #define GRegister(x) NTL_THREAD_LOCAL static _ntl_gbigint_wrapped x; _ntl_gbigint_watcher _WATCHER__ ## x(&x)
406 #define GRegister(x) NTL_TLS_LOCAL(_ntl_gbigint_wrapped, x); _ntl_gbigint_watcher _WATCHER__ ## x(&x)
398407
399408 // #define GRegister(x) NTL_THREAD_LOCAL static _ntl_gbigint x(0); _ntl_gbigint_watcher _WATCHER__ ## x(&x)
400409
13261335
13271336 GET_SIZE_NEG(sn, nneg, n);
13281337
1329 limb_cnt = k/NTL_ZZ_NBITS;
1330 k %= NTL_ZZ_NBITS;
1338 limb_cnt = ((unsigned long) k) / NTL_ZZ_NBITS;
1339 k = ((unsigned long) k) % NTL_ZZ_NBITS;
13311340 sres = sn + limb_cnt;
13321341 if (k != 0) sres++;
13331342
13851394
13861395 GET_SIZE_NEG(sn, nneg, n);
13871396
1388 limb_cnt = k/NTL_ZZ_NBITS;
1397 limb_cnt = ((unsigned long) k) / NTL_ZZ_NBITS;
13891398
13901399 sres = sn - limb_cnt;
13911400
14041413 ndata = DATA(n);
14051414 resdata = DATA(res);
14061415 ndata1 = ndata + limb_cnt;
1407 k %= NTL_ZZ_NBITS;
1416 k = ((unsigned long) k) % NTL_ZZ_NBITS;
14081417
14091418 if (k != 0) {
14101419 mpn_rshift(resdata, ndata1, sres, k);
15221531 void
15231532 _ntl_gsadd(_ntl_gbigint a, long b, _ntl_gbigint *cc)
15241533 {
1534 // FIXME: this is really inefficient...too much overhead
15251535 GRegister(B);
15261536 _ntl_gintoz(b, &B);
15271537 _ntl_gadd(a, B, cc);
29102920 {
29112921 GRegister(tmp);
29122922
2913 NTL_THREAD_LOCAL static double log_2;
2914 NTL_THREAD_LOCAL static long init = 0;
2923 static const double log_2 = log(2.0); // GLOBAL (assumes C++11 thread-safe init)
29152924
29162925 long s;
29172926 long shamt;
29182927 long correction;
29192928 double x;
2920
2921 if (!init) {
2922 log_2 = log(2.0);
2923 init = 1;
2924 }
29252929
29262930 if (_ntl_gsign(n) <= 0)
29272931 ArithmeticError("log argument <= 0");
35023506 for (i = 0; i < m; i++) {
35033507 q = Tdata[i]*inv;
35043508 d = mpn_addmul_1(Tdata+i, Ndata, n, q);
3509
3510 // (c, Tdata[i+n]) = c + d + Tdata[i+n]
35053511 t = Tdata[i+n] + d;
35063512 Tdata[i+n] = t + c;
35073513 if (t < d || (c == 1 && t + c == 0))
35233529
35243530 SIZE(res) = i;
35253531 SIZE(T) = 0;
3532 }
3533
3534
3535 // This montgomery code is for external consumption...
3536 // This is currently used in the CRT reconstruction step
3537 // for ZZ_pX arithmetic. It gives a nontrivial speedup
3538 // for smallish p (up to a few hundred bits)
3539
3540 class _ntl_reduce_struct_montgomery : public _ntl_reduce_struct {
3541 public:
3542 long m;
3543 mp_limb_t inv;
3544 _ntl_gbigint_wrapped N;
3545
3546 void eval(_ntl_gbigint *rres, _ntl_gbigint *TT);
3547 void adjust(_ntl_gbigint *x);
3548 };
3549
3550
3551
3552 // DIRT: may not work with non-empty "nails"
3553
3554 void _ntl_reduce_struct_montgomery::eval(_ntl_gbigint *rres, _ntl_gbigint *TT)
3555 {
3556 long n, sT, i;
3557 mp_limb_t *Ndata, *Tdata, *resdata, q, d, t, c;
3558 _ntl_gbigint res, T;
3559
3560
3561 T = *TT;
3562
3563 // quick zero test, in case of sparse polynomials
3564 if (ZEROP(T)) {
3565 _ntl_gzero(rres);
3566 return;
3567 }
3568
3569 n = SIZE(N);
3570 Ndata = DATA(N);
3571
3572 if (MustAlloc(T, m+n)) {
3573 _ntl_gsetlength(&T, m+n);
3574 *TT = T;
3575 }
3576
3577 res = *rres;
3578 if (MustAlloc(res, n)) {
3579 _ntl_gsetlength(&res, n);
3580 *rres = res;
3581 }
3582
3583 sT = SIZE(T);
3584 Tdata = DATA(T);
3585 resdata = DATA(res);
3586
3587 for (i = sT; i < m+n; i++)
3588 Tdata[i] = 0;
3589
3590 c = 0;
3591 for (i = 0; i < m; i++) {
3592 q = Tdata[i]*inv;
3593 d = mpn_addmul_1(Tdata+i, Ndata, n, q);
3594
3595 // (c, Tdata[i+n]) = c + d + Tdata[i+n]
3596 t = Tdata[i+n] + d;
3597 Tdata[i+n] = t + c;
3598 if (t < d || (c == 1 && t + c == 0))
3599 c = 1;
3600 else
3601 c = 0;
3602 }
3603
3604 if (c || mpn_cmp(Tdata + m, Ndata, n) >= 0) {
3605 mpn_sub_n(resdata, Tdata + m, Ndata, n);
3606 }
3607 else {
3608 for (i = 0; i < n; i++)
3609 resdata[i] = Tdata[m + i];
3610 }
3611
3612 i = n;
3613 STRIP(i, resdata);
3614
3615 SIZE(res) = i;
3616 SIZE(T) = 0;
3617 }
3618
3619 // this will adjust the given number by multiplying by the
3620 // montgomery scaling factor
3621
3622 void _ntl_reduce_struct_montgomery::adjust(_ntl_gbigint *x)
3623 {
3624 GRegister(tmp);
3625 _ntl_glshift(*x, m*NTL_ZZ_NBITS, &tmp);
3626 _ntl_gmod(tmp, N, x);
3627 }
3628
3629
3630
3631
3632 class _ntl_reduce_struct_plain : public _ntl_reduce_struct {
3633 public:
3634 _ntl_gbigint_wrapped N;
3635
3636 void eval(_ntl_gbigint *rres, _ntl_gbigint *TT)
3637 {
3638 _ntl_gmod(*TT, N, rres);
3639 }
3640
3641 void adjust(_ntl_gbigint *x) { }
3642 };
3643
3644 // assumption: all values passed to eval for montgomery reduction
3645 // are in [0, modulus*excess]
3646
3647 _ntl_reduce_struct *
3648 _ntl_reduce_struct_build(_ntl_gbigint modulus, _ntl_gbigint excess)
3649 {
3650 if (_ntl_godd(modulus)) {
3651 UniquePtr<_ntl_reduce_struct_montgomery> C;
3652 C.make();
3653
3654 C->m = _ntl_gsize(excess);
3655 C->inv = neg_inv_mod_limb(DATA(modulus)[0]);
3656 _ntl_gcopy(modulus, &C->N);
3657
3658 return C.release();
3659 }
3660 else {
3661 UniquePtr<_ntl_reduce_struct_plain> C;
3662 C.make();
3663
3664 _ntl_gcopy(modulus, &C->N);
3665
3666 return C.release();
3667 }
35263668 }
35273669
35283670
38393981
38403982 void _ntl_gfrombytes(_ntl_gbigint *x, const unsigned char *p, long n)
38413983 {
3842 long BytesPerLimb;
38433984 long lw, r, i, j;
38443985 mp_limb_t *xp, t;
38453986
3987 while (n > 0 && p[n-1] == 0) n--;
3988
38463989 if (n <= 0) {
3847 x = 0;
3990 _ntl_gzero(x);
38483991 return;
38493992 }
38503993
3851 BytesPerLimb = NTL_ZZ_NBITS/8;
3994 const long BytesPerLimb = NTL_ZZ_NBITS/8;
38523995
38533996
38543997 lw = n/BytesPerLimb;
38824025 t >>= (BytesPerLimb-r)*8;
38834026 xp[lw-1] = t;
38844027
3885 STRIP(lw, xp);
4028 // strip not necessary here
4029 // STRIP(lw, xp);
38864030 SIZE(*x) = lw;
38874031 }
38884032
38924036
38934037 void _ntl_gbytesfromz(unsigned char *p, _ntl_gbigint a, long n)
38944038 {
3895 long BytesPerLimb;
38964039 long lbits, lbytes, min_bytes, min_words, r;
38974040 long i, j;
38984041 mp_limb_t *ap, t;
38994042
39004043 if (n < 0) n = 0;
39014044
3902 BytesPerLimb = NTL_ZZ_NBITS/8;
4045 const long BytesPerLimb = NTL_ZZ_NBITS/8;
39034046
39044047 lbits = _ntl_g2log(a);
39054048 lbytes = (lbits+7)/8;
41744317 void eval(_ntl_gbigint *x, const long *b, _ntl_tmp_vec *tmp_vec);
41754318 };
41764319
4320
4321 #if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
4322
4323 class _ntl_crt_struct_tbl : public _ntl_crt_struct {
4324 public:
4325 Unique2DArray<mp_limb_t> v;
4326 long n;
4327 long sz;
4328
4329 bool special();
4330 void insert(long i, _ntl_gbigint m);
4331 _ntl_tmp_vec *extract();
4332 _ntl_tmp_vec *fetch();
4333 void eval(_ntl_gbigint *x, const long *b, _ntl_tmp_vec *tmp_vec);
4334
4335 };
4336
4337 #endif
4338
4339
4340
4341
41774342 class _ntl_crt_struct_fast : public _ntl_crt_struct {
41784343 public:
41794344 long n;
43074472 return C.release();
43084473 }
43094474
4475
4476 #if (defined(NTL_VIABLE_LL))
4477
4478 // alternative CRT code is viable
4479
4480 #if (defined(NTL_CRT_ALTCODE))
4481 // unconditionally use the alternative code,
4482 // as the tuning wizard says it's preferable for larger moduli
4483
4484 {
4485 UniquePtr<_ntl_crt_struct_tbl> C;
4486 C.make();
4487 C->n = n;
4488 C->sz = SIZE(p);
4489 C->v.SetDims(C->sz, C->n);
4490
4491 return C.release();
4492 }
4493 #elif (defined(NTL_CRT_ALTCODE_SMALL))
4494 // use the alternative code on "smaller" moduli...
4495 // For now, this triggers when n <= 16.
4496 // Unless the "long long" compiler support is really bad,
4497 // this should be a marginal win, as it avoids some
4498 // procedure call overhead.
4499
4500 if (n <= 16) {
4501 UniquePtr<_ntl_crt_struct_tbl> C;
4502 C.make();
4503 C->n = n;
4504 C->sz = SIZE(p);
4505 C->v.SetDims(C->sz, C->n);
4506
4507 return C.release();
4508 }
4509 else {
4510 UniquePtr<_ntl_crt_struct_basic> C;
4511 C.make();
4512
4513 long i;
4514
4515 C->n = n;
4516 C->v.SetLength(n);
4517 C->sbuf = SIZE(p)+2;
4518
4519 return C.release();
4520 }
4521 #else
43104522 {
43114523 UniquePtr<_ntl_crt_struct_basic> C;
43124524 C.make();
43194531
43204532 return C.release();
43214533 }
4534 #endif
4535
4536 #else
4537 {
4538 UniquePtr<_ntl_crt_struct_basic> C;
4539 C.make();
4540
4541 long i;
4542
4543 C->n = n;
4544 C->v.SetLength(n);
4545 C->sbuf = SIZE(p)+2;
4546
4547 return C.release();
4548 }
4549 #endif
4550
43224551 }
43234552
43244553 /* extracts existing tmp_vec, if possible -- read/write operation */
43274556 {
43284557 return 0;
43294558 }
4559
4560 #if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
4561 _ntl_tmp_vec *_ntl_crt_struct_tbl::extract()
4562 {
4563 return 0;
4564 }
4565 #endif
43304566
43314567 _ntl_tmp_vec *_ntl_crt_struct_fast::extract()
43324568 {
43434579 {
43444580 return 0;
43454581 }
4582
4583 #if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
4584 _ntl_tmp_vec *_ntl_crt_struct_tbl::fetch()
4585 {
4586 return 0;
4587 }
4588 #endif
43464589
43474590 _ntl_tmp_vec *_ntl_crt_struct_fast::fetch()
43484591 {
43624605 {
43634606 _ntl_gcopy(m, &v[i]);
43644607 }
4608
4609 #if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
4610 void _ntl_crt_struct_tbl::insert(long i, _ntl_gbigint m)
4611 {
4612 if (i < 0 || i >= n) LogicError("insert: bad args");
4613
4614 if (!m)
4615 for (long j = 0; j < sz; j++) v[j][i] = 0;
4616 else {
4617 long sm = SIZE(m);
4618 if (sm < 0 || sm > sz) LogicError("insert: bad args");
4619 const mp_limb_t *mdata = DATA(m);
4620 for (long j = 0; j < sm; j++)
4621 v[j][i] = mdata[j];
4622 for (long j = sm; j < sz; j++)
4623 v[j][i] = 0;
4624 }
4625 }
4626 #endif
43654627
43664628 void _ntl_crt_struct_fast::insert(long i, _ntl_gbigint m)
43674629 {
44534715 SIZE(x1) = sx;
44544716 }
44554717
4718
4719 #if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
4720
4721 #define CRT_ALTCODE_UNROLL (1)
4722
4723 void _ntl_crt_struct_tbl::eval(_ntl_gbigint *x, const long *b, _ntl_tmp_vec *generic_tmp_vec)
4724 {
4725 long sx;
4726 _ntl_gbigint x1;
4727 long i, j;
4728
4729 // quick test for zero vector
4730 // most likely, they are either all zero (if we are working
4731 // with some sparse polynomials) or none of them are zero,
4732 // so in the general case, this should go fast
4733 if (!b[0]) {
4734 i = 1;
4735 while (i < n && !b[i]) i++;
4736 if (i >= n) {
4737 _ntl_gzero(x);
4738 return;
4739 }
4740 }
4741
4742 sx = sz + 2;
4743 _ntl_gsetlength(x, sx);
4744 x1 = *x;
4745 mp_limb_t * NTL_RESTRICT xx = DATA(x1);
4746
4747
4748 const long Bnd = 1L << (NTL_BITS_PER_LONG-NTL_SP_NBITS);
4749
4750 if (n <= Bnd) {
4751 mp_limb_t carry=0;
4752
4753 for (i = 0; i < sz; i++) {
4754 const mp_limb_t *row = v[i];
4755
4756 ll_type acc;
4757 ll_mul(acc, row[0], b[0]);
4758
4759 #if (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
4760 switch (n) {
4761 case 16: ll_mul_add(acc, row[16-1], b[16-1]);
4762 case 15: ll_mul_add(acc, row[15-1], b[15-1]);
4763 case 14: ll_mul_add(acc, row[14-1], b[14-1]);
4764 case 13: ll_mul_add(acc, row[13-1], b[13-1]);
4765 case 12: ll_mul_add(acc, row[12-1], b[12-1]);
4766 case 11: ll_mul_add(acc, row[11-1], b[11-1]);
4767 case 10: ll_mul_add(acc, row[10-1], b[10-1]);
4768 case 9: ll_mul_add(acc, row[9-1], b[9-1]);
4769 case 8: ll_mul_add(acc, row[8-1], b[8-1]);
4770 case 7: ll_mul_add(acc, row[7-1], b[7-1]);
4771 case 6: ll_mul_add(acc, row[6-1], b[6-1]);
4772 case 5: ll_mul_add(acc, row[5-1], b[5-1]);
4773 case 4: ll_mul_add(acc, row[4-1], b[4-1]);
4774 case 3: ll_mul_add(acc, row[3-1], b[3-1]);
4775 case 2: ll_mul_add(acc, row[2-1], b[2-1]);
4776 }
4777 #else
4778 for (j = 1; j < n; j++)
4779 ll_mul_add(acc, row[j], b[j]);
4780 #endif
4781
4782 ll_add(acc, carry);
4783 xx[i] = ll_get_lo(acc);
4784 carry = ll_get_hi(acc);
4785 }
4786
4787 xx[sz] = carry;
4788 xx[sz+1] = 0;
4789 }
4790 else {
4791 ll_type carry;
4792 ll_init(carry, 0);
4793
4794 for (i = 0; i < sz; i++) {
4795 const mp_limb_t *row = v[i];
4796
4797 ll_type acc21;
4798 mp_limb_t acc0;
4799
4800 {
4801 ll_type sum;
4802 ll_mul(sum, row[0], b[0]);
4803
4804 #if (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
4805 ll_mul_add(sum, row[1], b[1]);
4806 ll_mul_add(sum, row[2], b[2]);
4807 ll_mul_add(sum, row[3], b[3]);
4808 ll_mul_add(sum, row[4], b[4]);
4809 ll_mul_add(sum, row[5], b[5]);
4810 ll_mul_add(sum, row[6], b[6]);
4811 ll_mul_add(sum, row[7], b[7]);
4812 ll_mul_add(sum, row[8], b[8]);
4813 ll_mul_add(sum, row[9], b[9]);
4814 ll_mul_add(sum, row[10], b[10]);
4815 ll_mul_add(sum, row[11], b[11]);
4816 ll_mul_add(sum, row[12], b[12]);
4817 ll_mul_add(sum, row[13], b[13]);
4818 ll_mul_add(sum, row[14], b[14]);
4819 ll_mul_add(sum, row[15], b[15]);
4820 #elif (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 2)
4821 ll_mul_add(sum, row[1], b[1]);
4822 ll_mul_add(sum, row[2], b[2]);
4823 ll_mul_add(sum, row[3], b[3]);
4824 #else
4825 for (j = 1; j < Bnd; j++)
4826 ll_mul_add(sum, row[j], b[j]);
4827 #endif
4828
4829
4830 ll_init(acc21, ll_get_hi(sum));
4831 acc0 = ll_get_lo(sum);
4832 }
4833
4834 const mp_limb_t *ap = row;
4835 const long *tp = b;
4836
4837 #if (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 2)
4838 long m = n - 4;
4839 ap += 4;
4840 tp += 4;
4841
4842 for (; m >= 8; m -= 8, ap += 8, tp += 8) {
4843 {
4844 ll_type sum;
4845 ll_mul(sum, ap[0], tp[0]);
4846 ll_mul_add(sum, ap[1], tp[1]);
4847 ll_mul_add(sum, ap[2], tp[2]);
4848 ll_mul_add(sum, ap[3], tp[3]);
4849
4850 ll_add(sum, acc0);
4851 acc0 = ll_get_lo(sum);
4852 ll_add(acc21, ll_get_hi(sum));
4853 }
4854 {
4855 ll_type sum;
4856 ll_mul(sum, ap[4+0], tp[4+0]);
4857 ll_mul_add(sum, ap[4+1], tp[4+1]);
4858 ll_mul_add(sum, ap[4+2], tp[4+2]);
4859 ll_mul_add(sum, ap[4+3], tp[4+3]);
4860
4861 ll_add(sum, acc0);
4862 acc0 = ll_get_lo(sum);
4863 ll_add(acc21, ll_get_hi(sum));
4864 }
4865 }
4866
4867 for (; m >= 4; m -= 4, ap += 4, tp += 4) {
4868 ll_type sum;
4869 ll_mul(sum, ap[0], tp[0]);
4870 ll_mul_add(sum, ap[1], tp[1]);
4871 ll_mul_add(sum, ap[2], tp[2]);
4872 ll_mul_add(sum, ap[3], tp[3]);
4873
4874 ll_add(sum, acc0);
4875 acc0 = ll_get_lo(sum);
4876 ll_add(acc21, ll_get_hi(sum));
4877 }
4878
4879
4880 #else
4881 long m;
4882 for (m = n-Bnd, ap += Bnd, tp += Bnd; m >= Bnd; m -= Bnd, ap += Bnd, tp += Bnd) {
4883
4884 ll_type sum;
4885 ll_mul(sum, ap[0], tp[0]);
4886
4887 #if (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
4888 ll_mul_add(sum, ap[1], tp[1]);
4889 ll_mul_add(sum, ap[2], tp[2]);
4890 ll_mul_add(sum, ap[3], tp[3]);
4891 ll_mul_add(sum, ap[4], tp[4]);
4892 ll_mul_add(sum, ap[5], tp[5]);
4893 ll_mul_add(sum, ap[6], tp[6]);
4894 ll_mul_add(sum, ap[7], tp[7]);
4895 ll_mul_add(sum, ap[8], tp[8]);
4896 ll_mul_add(sum, ap[9], tp[9]);
4897 ll_mul_add(sum, ap[10], tp[10]);
4898 ll_mul_add(sum, ap[11], tp[11]);
4899 ll_mul_add(sum, ap[12], tp[12]);
4900 ll_mul_add(sum, ap[13], tp[13]);
4901 ll_mul_add(sum, ap[14], tp[14]);
4902 ll_mul_add(sum, ap[15], tp[15]);
4903 #else
4904 for (long j = 1; j < Bnd; j++)
4905 ll_mul_add(sum, ap[j], tp[j]);
4906 #endif
4907
4908 ll_add(sum, acc0);
4909 acc0 = ll_get_lo(sum);
4910 ll_add(acc21, ll_get_hi(sum));
4911 }
4912 #endif
4913
4914 if (m > 0) {
4915 ll_type sum;
4916 ll_mul(sum, ap[0], tp[0]);
4917
4918 #if (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
4919 switch (m) {
4920 case 15: ll_mul_add(sum, ap[15-1], tp[15-1]);
4921 case 14: ll_mul_add(sum, ap[14-1], tp[14-1]);
4922 case 13: ll_mul_add(sum, ap[13-1], tp[13-1]);
4923 case 12: ll_mul_add(sum, ap[12-1], tp[12-1]);
4924 case 11: ll_mul_add(sum, ap[11-1], tp[11-1]);
4925 case 10: ll_mul_add(sum, ap[10-1], tp[10-1]);
4926 case 9: ll_mul_add(sum, ap[9-1], tp[9-1]);
4927 case 8: ll_mul_add(sum, ap[8-1], tp[8-1]);
4928 case 7: ll_mul_add(sum, ap[7-1], tp[7-1]);
4929 case 6: ll_mul_add(sum, ap[6-1], tp[6-1]);
4930 case 5: ll_mul_add(sum, ap[5-1], tp[5-1]);
4931 case 4: ll_mul_add(sum, ap[4-1], tp[4-1]);
4932 case 3: ll_mul_add(sum, ap[3-1], tp[3-1]);
4933 case 2: ll_mul_add(sum, ap[2-1], tp[2-1]);
4934 }
4935 #else
4936 for (m--, ap++, tp++; m > 0; m--, ap++, tp++)
4937 ll_mul_add(sum, ap[0], tp[0]);
4938 #endif
4939 ll_add(sum, acc0);
4940 acc0 = ll_get_lo(sum);
4941 ll_add(acc21, ll_get_hi(sum));
4942
4943 }
4944
4945 ll_add(carry, acc0);
4946 xx[i] = ll_get_lo(carry);
4947 ll_add(acc21, ll_get_hi(carry));
4948 carry = acc21;
4949 }
4950
4951 xx[sz] = ll_get_lo(carry);
4952 xx[sz+1] = ll_get_hi(carry);
4953 }
4954
4955
4956 while (sx > 0 && xx[sx-1] == 0) sx--;
4957 SIZE(x1) = sx;
4958 }
4959 #endif
44564960
44574961 void _ntl_crt_struct_fast::eval(_ntl_gbigint *x, const long *b, _ntl_tmp_vec *generic_tmp_vec)
44584962 {
44985002
44995003
// The basic CRT structure always answers false to special().
45005004 bool _ntl_crt_struct_basic::special() { return false; }
5005
5006 #if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
// The table-driven CRT structure always answers false to special().
5007 bool _ntl_crt_struct_tbl::special() { return false; }
5008 #endif
5009
5010
// The fast CRT structure always answers true to special().
45015011 bool _ntl_crt_struct_fast::special() { return true; }
45025012
45035013
45585068
45595069
45605070
4561 #ifdef NTL_TBL_REM
5071 #if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_REM))
45625072
45635073 class _ntl_rem_struct_tbl : public _ntl_rem_struct {
45645074 public:
45795089 _ntl_rem_struct *_ntl_rem_struct_build(long n, _ntl_gbigint modulus, long (*p)(long))
45805090 {
45815091
4582 #ifdef NTL_TBL_REM
4583 if (n <= 800
4584 && sizeof(NTL_ULL_TYPE) == 2*sizeof(long)
4585 && NTL_ZZ_NBITS == NTL_BITS_PER_LONG) {
4586
5092 #if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_REM))
5093 if (n <= 800) {
45875094 UniqueArray<long> q;
45885095 UniqueArray<mp_limb_t> inv_primes;
45895096 Unique2DArray<mp_limb_t> tbl;
46275134
46285135 return R.release();
46295136 }
4630
4631
46325137 #endif
46335138
4634 if ( n >= 32 && n <= 256) {
5139 if (n >= 32 && n <= 256) {
46355140 UniqueArray<long> q;
46365141 long i, j;
46375142 long levels, vec_len;
48155320 }
48165321
48175322
4818 #ifdef NTL_TBL_REM
5323 #if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_REM))
48195324
48205325 _ntl_tmp_vec *_ntl_rem_struct_tbl::fetch()
48215326 {
48705375
48715376
48725377
4873 #ifdef NTL_TBL_REM
5378
5379 #if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_REM))
48745380
48755381 static inline
48765382 mp_limb_t tbl_red_21(mp_limb_t hi, mp_limb_t lo, long d, mp_limb_t dinv)
48985404 // has exactly NTL_SP_NBITS bits. This will be the case for
48995405 // the FFT primes that are used.
49005406
5407 static inline
5408 mp_limb_t tbl_red_31(mp_limb_t x2, mp_limb_t x1, mp_limb_t x0,
5409 long d, mp_limb_t dinv)
5410 {
5411 mp_limb_t carry = tbl_red_21(x2, x1, d, dinv);
5412 return tbl_red_21(carry, x0, d, dinv);
5413 }
5414
5415 // NOTE: tbl_red_31 assumes x2 < d
5416
49015417
49025418 #if (NTL_SP_NBITS == NTL_BITS_PER_LONG-2)
49035419
49215437 long i;
49225438 for (i = 0; i < n; i++) {
49235439 mp_limb_t *tp = tbl[i];
4924 NTL_ULL_TYPE acc = adata[0];
5440 ll_type acc;
5441 ll_init(acc, adata[0]);
49255442 long j;
49265443 for (j = 1; j < sa; j++)
4927 acc += ((NTL_ULL_TYPE) adata[j]) * ((NTL_ULL_TYPE) tp[j]);
5444 ll_mul_add(acc, adata[j], tp[j]);
49285445
49295446 mp_limb_t accvec[2];
4930 accvec[0] = acc;
4931 accvec[1] = acc >> NTL_ZZ_NBITS;
4932 x[i] = tbl_red_n1(accvec, 2, primes[i], inv_primes[i]);
5447 x[i] = tbl_red_31(0, ll_get_hi(acc), ll_get_lo(acc), primes[i], inv_primes[i]);
49335448 }
49345449 }
49355450 else {
49385453 mp_limb_t *ap = adata;
49395454 mp_limb_t *tp = tbl[i];
49405455
4941 NTL_ULL_TYPE acc21;
5456 ll_type acc21;
49425457 mp_limb_t acc0;
49435458
49445459 {
4945 NTL_ULL_TYPE sum = ap[0];
4946 sum += ((NTL_ULL_TYPE) ap[1]) * ((NTL_ULL_TYPE) tp[1]);
4947 sum += ((NTL_ULL_TYPE) ap[2]) * ((NTL_ULL_TYPE) tp[2]);
4948 sum += ((NTL_ULL_TYPE) ap[3]) * ((NTL_ULL_TYPE) tp[3]);
4949
4950 acc21 = sum >> NTL_BITS_PER_LONG;
4951 acc0 = sum;
5460 ll_type sum;
5461 ll_init(sum, ap[0]);
5462
5463 ll_mul_add(sum, ap[1], tp[1]);
5464 ll_mul_add(sum, ap[2], tp[2]);
5465 ll_mul_add(sum, ap[3], tp[3]);
5466
5467 ll_init(acc21, ll_get_hi(sum));
5468 acc0 = ll_get_lo(sum);
49525469 }
49535470
4954 long m;
4955 for (m = sa-4, ap += 4, tp += 4; m >= 4; m -= 4, ap += 4, tp += 4) {
4956 NTL_ULL_TYPE sum = ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
4957 sum += ((NTL_ULL_TYPE) ap[1]) * ((NTL_ULL_TYPE) tp[1]);
4958 sum += ((NTL_ULL_TYPE) ap[2]) * ((NTL_ULL_TYPE) tp[2]);
4959 sum += ((NTL_ULL_TYPE) ap[3]) * ((NTL_ULL_TYPE) tp[3]);
4960
4961 mp_limb_t sum1 = sum >> NTL_BITS_PER_LONG;
4962 mp_limb_t sum0 = sum;
4963 NTL_ULL_TYPE carry_acc0 = ((NTL_ULL_TYPE) acc0) + ((NTL_ULL_TYPE) sum0);
4964 mp_limb_t carry = carry_acc0 >> NTL_BITS_PER_LONG;
4965 acc0 = carry_acc0;
4966 NTL_ULL_TYPE x = ((NTL_ULL_TYPE) sum1) + ((NTL_ULL_TYPE) carry);
4967 acc21 += x;
5471 long m=sa-4;
5472 ap += 4;
5473 tp += 4;
5474
5475 for (; m >= 8; m -= 8, ap += 8, tp += 8) {
5476 {
5477 ll_type sum;
5478 ll_mul(sum, ap[0], tp[0]);
5479 ll_mul_add(sum, ap[1], tp[1]);
5480 ll_mul_add(sum, ap[2], tp[2]);
5481 ll_mul_add(sum, ap[3], tp[3]);
5482
5483 ll_add(sum, acc0);
5484 acc0 = ll_get_lo(sum);
5485 ll_add(acc21, ll_get_hi(sum));
5486 }
5487 {
5488
5489 ll_type sum;
5490 ll_mul(sum, ap[4+0], tp[4+0]);
5491 ll_mul_add(sum, ap[4+1], tp[4+1]);
5492 ll_mul_add(sum, ap[4+2], tp[4+2]);
5493 ll_mul_add(sum, ap[4+3], tp[4+3]);
5494
5495 ll_add(sum, acc0);
5496 acc0 = ll_get_lo(sum);
5497 ll_add(acc21, ll_get_hi(sum));
5498 }
49685499 }
49695500
5501 for (; m >= 4; m -= 4, ap += 4, tp += 4) {
5502 ll_type sum;
5503 ll_mul(sum, ap[0], tp[0]);
5504 ll_mul_add(sum, ap[1], tp[1]);
5505 ll_mul_add(sum, ap[2], tp[2]);
5506 ll_mul_add(sum, ap[3], tp[3]);
5507
5508 ll_add(sum, acc0);
5509 acc0 = ll_get_lo(sum);
5510 ll_add(acc21, ll_get_hi(sum));
5511 }
5512
49705513 if (m > 0) {
4971 NTL_ULL_TYPE sum = ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
5514 ll_type sum;
5515 ll_mul(sum, ap[0], tp[0]);
49725516 for (m--, ap++, tp++; m > 0; m--, ap++, tp++)
4973 sum += ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
4974
4975 mp_limb_t sum1 = sum >> NTL_BITS_PER_LONG;
4976 mp_limb_t sum0 = sum;
4977 NTL_ULL_TYPE carry_acc0 = ((NTL_ULL_TYPE) acc0) + ((NTL_ULL_TYPE) sum0);
4978 mp_limb_t carry = carry_acc0 >> NTL_BITS_PER_LONG;
4979 acc0 = carry_acc0;
4980 NTL_ULL_TYPE x = ((NTL_ULL_TYPE) sum1) + ((NTL_ULL_TYPE) carry);
4981 acc21 += x;
5517 ll_mul_add(sum, ap[0], tp[0]);
5518
5519
5520 ll_add(sum, acc0);
5521 acc0 = ll_get_lo(sum);
5522 ll_add(acc21, ll_get_hi(sum));
49825523 }
49835524
4984 mp_limb_t accvec[3];
4985 accvec[0] = acc0;
4986 accvec[1] = acc21;
4987 accvec[2] = acc21 >> NTL_BITS_PER_LONG;
4988 x[i] = tbl_red_n1(accvec, 3, primes[i], inv_primes[i]);
5525 x[i] = tbl_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, primes[i], inv_primes[i]);
49895526 }
49905527 }
49915528 }
49925529
49935530 #else
49945531
4995 // General case: no loop unrolling
5532 // General case: some loop unrolling (also using "Duff's Device")
5533 // for the case where BPL-SPNBITS == 4: this is the common
5534 // case on 64-bit machines. The loop unrolling and Duff's Device seem
5535 // to shave off 5-10%.
5536
5537 #define TBL_UNROLL (1)
49965538
49975539 // DIRT: won't work if GMP has nails
49985540 void _ntl_rem_struct_tbl::eval(long *x, _ntl_gbigint a,
50135555 long i;
50145556 for (i = 0; i < n; i++) {
50155557 mp_limb_t *tp = tbl[i];
5016 NTL_ULL_TYPE acc = adata[0];
5558
5559
5560 ll_type acc;
5561 ll_init(acc, adata[0]);
5562
5563 #if (TBL_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
5564 switch (sa) {
5565 case 16: ll_mul_add(acc, adata[16-1], tp[16-1]);
5566 case 15: ll_mul_add(acc, adata[15-1], tp[15-1]);
5567 case 14: ll_mul_add(acc, adata[14-1], tp[14-1]);
5568 case 13: ll_mul_add(acc, adata[13-1], tp[13-1]);
5569 case 12: ll_mul_add(acc, adata[12-1], tp[12-1]);
5570 case 11: ll_mul_add(acc, adata[11-1], tp[11-1]);
5571 case 10: ll_mul_add(acc, adata[10-1], tp[10-1]);
5572 case 9: ll_mul_add(acc, adata[9-1], tp[9-1]);
5573 case 8: ll_mul_add(acc, adata[8-1], tp[8-1]);
5574 case 7: ll_mul_add(acc, adata[7-1], tp[7-1]);
5575 case 6: ll_mul_add(acc, adata[6-1], tp[6-1]);
5576 case 5: ll_mul_add(acc, adata[5-1], tp[5-1]);
5577 case 4: ll_mul_add(acc, adata[4-1], tp[4-1]);
5578 case 3: ll_mul_add(acc, adata[3-1], tp[3-1]);
5579 case 2: ll_mul_add(acc, adata[2-1], tp[2-1]);
5580 }
5581
5582 #else
50175583 long j;
50185584 for (j = 1; j < sa; j++)
5019 acc += ((NTL_ULL_TYPE) adata[j]) * ((NTL_ULL_TYPE) tp[j]);
5020
5021 mp_limb_t accvec[2];
5022 accvec[0] = acc;
5023 accvec[1] = acc >> NTL_ZZ_NBITS;
5024 x[i] = tbl_red_n1(accvec, 2, primes[i], inv_primes[i]);
5585 ll_mul_add(acc, adata[j], tp[j]);
5586 #endif
5587
5588 x[i] = tbl_red_31(0, ll_get_hi(acc), ll_get_lo(acc), primes[i], inv_primes[i]);
50255589 }
50265590 }
50275591 else {
50305594 mp_limb_t *ap = adata;
50315595 mp_limb_t *tp = tbl[i];
50325596
5033 NTL_ULL_TYPE acc21;
5597 ll_type acc21;
50345598 mp_limb_t acc0;
50355599
50365600 {
5037 NTL_ULL_TYPE sum = ap[0];
5601 ll_type sum;
5602 ll_init(sum, ap[0]);
5603
5604 #if (TBL_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
5605 ll_mul_add(sum, ap[1], tp[1]);
5606 ll_mul_add(sum, ap[2], tp[2]);
5607 ll_mul_add(sum, ap[3], tp[3]);
5608 ll_mul_add(sum, ap[4], tp[4]);
5609 ll_mul_add(sum, ap[5], tp[5]);
5610 ll_mul_add(sum, ap[6], tp[6]);
5611 ll_mul_add(sum, ap[7], tp[7]);
5612 ll_mul_add(sum, ap[8], tp[8]);
5613 ll_mul_add(sum, ap[9], tp[9]);
5614 ll_mul_add(sum, ap[10], tp[10]);
5615 ll_mul_add(sum, ap[11], tp[11]);
5616 ll_mul_add(sum, ap[12], tp[12]);
5617 ll_mul_add(sum, ap[13], tp[13]);
5618 ll_mul_add(sum, ap[14], tp[14]);
5619 ll_mul_add(sum, ap[15], tp[15]);
5620 #else
50385621 for (long j = 1; j < Bnd; j++)
5039 sum += ((NTL_ULL_TYPE) ap[j]) * ((NTL_ULL_TYPE) tp[j]);
5040
5041 acc21 = sum >> NTL_BITS_PER_LONG;
5042 acc0 = sum;
5622 ll_mul_add(sum, ap[j], tp[j]);
5623 #endif
5624
5625 ll_init(acc21, ll_get_hi(sum));
5626 acc0 = ll_get_lo(sum);
50435627 }
50445628
50455629 long m;
50465630 for (m = sa-Bnd, ap += Bnd, tp += Bnd; m >= Bnd; m -= Bnd, ap += Bnd, tp += Bnd) {
5047 NTL_ULL_TYPE sum = ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
5631
5632 ll_type sum;
5633 ll_mul(sum, ap[0], tp[0]);
5634
5635 #if (TBL_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
5636 ll_mul_add(sum, ap[1], tp[1]);
5637 ll_mul_add(sum, ap[2], tp[2]);
5638 ll_mul_add(sum, ap[3], tp[3]);
5639 ll_mul_add(sum, ap[4], tp[4]);
5640 ll_mul_add(sum, ap[5], tp[5]);
5641 ll_mul_add(sum, ap[6], tp[6]);
5642 ll_mul_add(sum, ap[7], tp[7]);
5643 ll_mul_add(sum, ap[8], tp[8]);
5644 ll_mul_add(sum, ap[9], tp[9]);
5645 ll_mul_add(sum, ap[10], tp[10]);
5646 ll_mul_add(sum, ap[11], tp[11]);
5647 ll_mul_add(sum, ap[12], tp[12]);
5648 ll_mul_add(sum, ap[13], tp[13]);
5649 ll_mul_add(sum, ap[14], tp[14]);
5650 ll_mul_add(sum, ap[15], tp[15]);
5651 #else
50485652 for (long j = 1; j < Bnd; j++)
5049 sum += ((NTL_ULL_TYPE) ap[j]) * ((NTL_ULL_TYPE) tp[j]);
5050
5051 mp_limb_t sum1 = sum >> NTL_BITS_PER_LONG;
5052 mp_limb_t sum0 = sum;
5053 NTL_ULL_TYPE carry_acc0 = ((NTL_ULL_TYPE) acc0) + ((NTL_ULL_TYPE) sum0);
5054 mp_limb_t carry = carry_acc0 >> NTL_BITS_PER_LONG;
5055 acc0 = carry_acc0;
5056 NTL_ULL_TYPE x = ((NTL_ULL_TYPE) sum1) + ((NTL_ULL_TYPE) carry);
5057 acc21 += x;
5653 ll_mul_add(sum, ap[j], tp[j]);
5654 #endif
5655 ll_add(sum, acc0);
5656 acc0 = ll_get_lo(sum);
5657 ll_add(acc21, ll_get_hi(sum));
50585658 }
50595659
50605660 if (m > 0) {
5061 NTL_ULL_TYPE sum = ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
5661 ll_type sum;
5662 ll_mul(sum, ap[0], tp[0]);
5663
5664 #if (TBL_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
5665 switch (m) {
5666 case 15: ll_mul_add(sum, ap[15-1], tp[15-1]);
5667 case 14: ll_mul_add(sum, ap[14-1], tp[14-1]);
5668 case 13: ll_mul_add(sum, ap[13-1], tp[13-1]);
5669 case 12: ll_mul_add(sum, ap[12-1], tp[12-1]);
5670 case 11: ll_mul_add(sum, ap[11-1], tp[11-1]);
5671 case 10: ll_mul_add(sum, ap[10-1], tp[10-1]);
5672 case 9: ll_mul_add(sum, ap[9-1], tp[9-1]);
5673 case 8: ll_mul_add(sum, ap[8-1], tp[8-1]);
5674 case 7: ll_mul_add(sum, ap[7-1], tp[7-1]);
5675 case 6: ll_mul_add(sum, ap[6-1], tp[6-1]);
5676 case 5: ll_mul_add(sum, ap[5-1], tp[5-1]);
5677 case 4: ll_mul_add(sum, ap[4-1], tp[4-1]);
5678 case 3: ll_mul_add(sum, ap[3-1], tp[3-1]);
5679 case 2: ll_mul_add(sum, ap[2-1], tp[2-1]);
5680 }
5681 #else
50625682 for (m--, ap++, tp++; m > 0; m--, ap++, tp++)
5063 sum += ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
5064
5065 mp_limb_t sum1 = sum >> NTL_BITS_PER_LONG;
5066 mp_limb_t sum0 = sum;
5067 NTL_ULL_TYPE carry_acc0 = ((NTL_ULL_TYPE) acc0) + ((NTL_ULL_TYPE) sum0);
5068 mp_limb_t carry = carry_acc0 >> NTL_BITS_PER_LONG;
5069 acc0 = carry_acc0;
5070 NTL_ULL_TYPE x = ((NTL_ULL_TYPE) sum1) + ((NTL_ULL_TYPE) carry);
5071 acc21 += x;
5683 ll_mul_add(sum, ap[0], tp[0]);
5684 #endif
5685 ll_add(sum, acc0);
5686 acc0 = ll_get_lo(sum);
5687 ll_add(acc21, ll_get_hi(sum));
50725688 }
50735689
5074 mp_limb_t accvec[3];
5075 accvec[0] = acc0;
5076 accvec[1] = acc21;
5077 accvec[2] = acc21 >> NTL_BITS_PER_LONG;
5078 x[i] = tbl_red_n1(accvec, 3, primes[i], inv_primes[i]);
5690 x[i] = tbl_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0,
5691 primes[i], inv_primes[i]);
50795692 }
50805693 }
50815694 }
54866099 }
54876100
54886101
5489
5490
6102 // general preconditioned remainder
6103
6104
6105
6106 #ifndef NTL_VIABLE_LL
6107
6108
// Placeholder implementation type used when NTL_VIABLE_LL is not defined:
// it adds no data of its own to _ntl_general_rem_one_struct.
6109 class _ntl_general_rem_one_impl : public _ntl_general_rem_one_struct {
6110 };
6111
6112 _ntl_general_rem_one_struct *
6113 _ntl_general_rem_one_struct_build(long p, long sz)
6114 {
6115 return 0;
6116 }
6117
6118 long
6119 _ntl_general_rem_one_struct_apply(NTL_verylong a, long p, _ntl_general_rem_one_struct *pinfo)
6120 {
6121 return _ntl_gsmod(a, p);
6122 }
6123
6124
6125
6126
6127 #else
6128
// Upper limit on the table length: _ntl_general_rem_one_struct_build
// clamps its sz argument to this value.
6129 #define REM_ONE_THRESH (256)
6130
// Precomputed data for reducing a multi-limb integer modulo a fixed
// single-precision modulus p; populated by
// _ntl_general_rem_one_struct_build and read by
// _ntl_general_rem_one_struct_apply.
6131 class _ntl_general_rem_one_impl : public _ntl_general_rem_one_struct {
6132 public:
// number of entries in tbl (at most REM_ONE_THRESH)
6133 long sz;
// reduction data for p, as returned by make_sp_ll_reduce_struct(p)
6134 sp_ll_reduce_struct red_struct;
// set by the builder to 1L << (NTL_BITS_PER_LONG - _ntl_g2logs(p));
// the apply routine uses it as the block length for accumulation
6135 long Bnd;
// tbl[j] = t^j mod p, where t = 2^NTL_ZZ_NBITS mod p
6136 UniqueArray<mp_limb_t> tbl;
6137 };
6138
6139 _ntl_general_rem_one_struct *
6140 _ntl_general_rem_one_struct_build(long p, long sz)
6141 {
6142 if (p < 2 || p >= NTL_SP_BOUND)
6143 LogicError("_ntl_general_rem_one_struct_build: bad args (p)");
6144
6145 if (sz < 0)
6146 LogicError("_ntl_general_rem_one_struct_build: bad args (sz)");
6147
6148 if (sz > REM_ONE_THRESH) sz = REM_ONE_THRESH;
6149
6150 if (sz == 0) return 0;
6151
6152
6153 UniquePtr<_ntl_general_rem_one_impl> ptr;
6154 ptr.make();
6155
6156 ptr->sz = sz;
6157
6158 ptr->red_struct = make_sp_ll_reduce_struct(p);
6159
6160 ptr->Bnd = 1L << (NTL_BITS_PER_LONG-_ntl_g2logs(p));
6161
6162 ptr->tbl.SetLength(sz);
6163
6164 long t = 1;
6165 for (long j = 0; j < NTL_ZZ_NBITS; j++) {
6166 t += t;
6167 if (t >= p) t -= p;
6168 }
6169
6170 long t1 = 1;
6171 ptr->tbl[0] = 1;
6172 for (long j = 1; j < sz; j++) {
6173 t1 = MulMod(t1, t, p);
6174 ptr->tbl[j] = t1;
6175 }
6176
6177 return ptr.release();
6178 }
6179
6180
6181
6182
// Compute a mod p using the table built by _ntl_general_rem_one_struct_build.
//
// a     : the integer to reduce (may be negative; the result is then
//         passed through NegateMod)
// p     : the modulus
// pinfo : structure returned by the build routine, or null
//
// Returns 0 when a is zero. Falls back to _ntl_gsmod when pinfo is null and
// to mpn_mod_1 when a has more limbs than the table has entries. Otherwise
// the limbs of a are multiplied by the matching table entries, summed, and
// the sum is reduced with sp_ll_red_31.
// NOTE(review): the block length Bnd is presumably chosen so that a block of
// Bnd products cannot overflow a double-word accumulator -- confirm against
// the definition of Bnd in the build routine.
6183 long
6184 _ntl_general_rem_one_struct_apply(NTL_verylong a, long p, _ntl_general_rem_one_struct *pinfo)
6185 {
6186 if (ZEROP(a)) return 0;
6187
// no precomputed data: use the generic single-precision remainder
6188 if (!pinfo) {
6189 return _ntl_gsmod(a, p);
6190 }
6191
6192 _ntl_general_rem_one_impl *ptr = (_ntl_general_rem_one_impl *) pinfo;
6193
6194
// copy the precomputed fields into locals
6195 long sz = ptr->sz;
6196 sp_ll_reduce_struct red_struct = ptr->red_struct;
6197 long Bnd = ptr->Bnd;
6198 mp_limb_t *tbl = ptr->tbl.elts();
6199
// a_sz and a_neg are filled in by GET_SIZE_NEG; a_data points at the limbs
6200 long a_sz, a_neg;
6201 mp_limb_t *a_data;
6202 GET_SIZE_NEG(a_sz, a_neg, a);
6203 a_data = DATA(a);
6204
// a has more limbs than the table covers: let GMP do the work
6205 if (a_sz > sz) {
6206 long res = mpn_mod_1(a_data, a_sz, p);
6207 if (a_neg) res = NegateMod(res, p);
6208 return res;
6209 }
// at most Bnd limbs: everything is summed in one double-word accumulator
6210 else if (a_sz <= Bnd) {
6211 ll_type acc;
// limb 0 is added as-is (tbl[0] is 1), so the products start at j = 1
6212 ll_init(acc, a_data[0]);
6213
6214 {
6215 long j = 1;
6216
// 16-way unrolled main loop
6217 for (; j <= a_sz-16; j += 16) {
6218 ll_mul_add(acc, a_data[j+0], tbl[j+0]);
6219 ll_mul_add(acc, a_data[j+1], tbl[j+1]);
6220 ll_mul_add(acc, a_data[j+2], tbl[j+2]);
6221 ll_mul_add(acc, a_data[j+3], tbl[j+3]);
6222 ll_mul_add(acc, a_data[j+4], tbl[j+4]);
6223 ll_mul_add(acc, a_data[j+5], tbl[j+5]);
6224 ll_mul_add(acc, a_data[j+6], tbl[j+6]);
6225 ll_mul_add(acc, a_data[j+7], tbl[j+7]);
6226 ll_mul_add(acc, a_data[j+8], tbl[j+8]);
6227 ll_mul_add(acc, a_data[j+9], tbl[j+9]);
6228 ll_mul_add(acc, a_data[j+10], tbl[j+10]);
6229 ll_mul_add(acc, a_data[j+11], tbl[j+11]);
6230 ll_mul_add(acc, a_data[j+12], tbl[j+12]);
6231 ll_mul_add(acc, a_data[j+13], tbl[j+13]);
6232 ll_mul_add(acc, a_data[j+14], tbl[j+14]);
6233 ll_mul_add(acc, a_data[j+15], tbl[j+15]);
6234 }
6235
// 4-way unrolled loop for what is left
6236 for (; j <= a_sz-4; j += 4) {
6237 ll_mul_add(acc, a_data[j+0], tbl[j+0]);
6238 ll_mul_add(acc, a_data[j+1], tbl[j+1]);
6239 ll_mul_add(acc, a_data[j+2], tbl[j+2]);
6240 ll_mul_add(acc, a_data[j+3], tbl[j+3]);
6241 }
6242
// final 0..3 limbs
6243 for (; j < a_sz; j++)
6244 ll_mul_add(acc, a_data[j+0], tbl[j+0]);
6245 }
6246
6247
// the sum occupies two words, so the top word passed to the reduction is 0
6248 long res = sp_ll_red_31(0, ll_get_hi(acc), ll_get_lo(acc), p, red_struct);
6249 if (a_neg) res = NegateMod(res, p);
6250 return res;
6251 }
// More than Bnd limbs: process blocks of Bnd limbs. acc0 holds the low
// word of the running total and acc21 accumulates the upper words.
6252 else if (Bnd > 16) {
6253 ll_type acc21;
6254 ll_init(acc21, 0);
6255 mp_limb_t acc0 = 0;
6256
6257 long jj = 0;
6258 for (; jj <= a_sz-Bnd; jj += Bnd) {
6259 ll_type acc;
// start each block from the current low word
6260 ll_init(acc, acc0);
6261
6262 long j = jj;
6263
// one full block of Bnd limbs, 16 at a time
6264 for (; j <= jj+Bnd-16; j += 16) {
6265 ll_mul_add(acc, a_data[j+0], tbl[j+0]);
6266 ll_mul_add(acc, a_data[j+1], tbl[j+1]);
6267 ll_mul_add(acc, a_data[j+2], tbl[j+2]);
6268 ll_mul_add(acc, a_data[j+3], tbl[j+3]);
6269 ll_mul_add(acc, a_data[j+4], tbl[j+4]);
6270 ll_mul_add(acc, a_data[j+5], tbl[j+5]);
6271 ll_mul_add(acc, a_data[j+6], tbl[j+6]);
6272 ll_mul_add(acc, a_data[j+7], tbl[j+7]);
6273 ll_mul_add(acc, a_data[j+8], tbl[j+8]);
6274 ll_mul_add(acc, a_data[j+9], tbl[j+9]);
6275 ll_mul_add(acc, a_data[j+10], tbl[j+10]);
6276 ll_mul_add(acc, a_data[j+11], tbl[j+11]);
6277 ll_mul_add(acc, a_data[j+12], tbl[j+12]);
6278 ll_mul_add(acc, a_data[j+13], tbl[j+13]);
6279 ll_mul_add(acc, a_data[j+14], tbl[j+14]);
6280 ll_mul_add(acc, a_data[j+15], tbl[j+15]);
6281 }
6282
// split the block sum: low word stays in acc0, high word goes to acc21
6283 acc0 = ll_get_lo(acc);
6284 ll_add(acc21, ll_get_hi(acc));
6285 }
6286
// trailing partial block (fewer than Bnd limbs)
6287 if (jj < a_sz) {
6288 ll_type acc;
6289 ll_init(acc, acc0);
6290
6291 long j = jj;
6292
6293 for (; j <= a_sz-4; j += 4) {
6294 ll_mul_add(acc, a_data[j+0], tbl[j+0]);
6295 ll_mul_add(acc, a_data[j+1], tbl[j+1]);
6296 ll_mul_add(acc, a_data[j+2], tbl[j+2]);
6297 ll_mul_add(acc, a_data[j+3], tbl[j+3]);
6298 }
6299
6300 for (; j < a_sz; j++)
6301 ll_mul_add(acc, a_data[j+0], tbl[j+0]);
6302
6303 acc0 = ll_get_lo(acc);
6304 ll_add(acc21, ll_get_hi(acc));
6305 }
6306
// reduce the three-word total (hi(acc21), lo(acc21), acc0)
6307 long res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, red_struct);
6308 if (a_neg) res = NegateMod(res, p);
6309 return res;
6310 }
// Same scheme with the block length fixed at 16 and the block fully
// unrolled. Here the block starts with ll_mul and acc0 is added at the end.
6311 else if (Bnd == 16) {
6312 ll_type acc21;
6313 ll_init(acc21, 0);
6314 mp_limb_t acc0 = 0;
6315
6316 long jj = 0;
6317 for (; jj <= a_sz-16; jj += 16) {
6318 ll_type acc;
6319
6320 long j = jj;
6321
6322 ll_mul(acc, a_data[j+0], tbl[j+0]);
6323 ll_mul_add(acc, a_data[j+1], tbl[j+1]);
6324 ll_mul_add(acc, a_data[j+2], tbl[j+2]);
6325 ll_mul_add(acc, a_data[j+3], tbl[j+3]);
6326 ll_mul_add(acc, a_data[j+4], tbl[j+4]);
6327 ll_mul_add(acc, a_data[j+5], tbl[j+5]);
6328 ll_mul_add(acc, a_data[j+6], tbl[j+6]);
6329 ll_mul_add(acc, a_data[j+7], tbl[j+7]);
6330 ll_mul_add(acc, a_data[j+8], tbl[j+8]);
6331 ll_mul_add(acc, a_data[j+9], tbl[j+9]);
6332 ll_mul_add(acc, a_data[j+10], tbl[j+10]);
6333 ll_mul_add(acc, a_data[j+11], tbl[j+11]);
6334 ll_mul_add(acc, a_data[j+12], tbl[j+12]);
6335 ll_mul_add(acc, a_data[j+13], tbl[j+13]);
6336 ll_mul_add(acc, a_data[j+14], tbl[j+14]);
6337 ll_mul_add(acc, a_data[j+15], tbl[j+15]);
6338
// fold in the previous low word, then split the result as before
6339 ll_add(acc, acc0);
6340 acc0 = ll_get_lo(acc);
6341 ll_add(acc21, ll_get_hi(acc));
6342 }
6343
// trailing partial block (fewer than 16 limbs)
6344 if (jj < a_sz) {
6345 ll_type acc;
6346 ll_init(acc, acc0);
6347
6348 long j = jj;
6349
6350 for (; j <= a_sz-4; j += 4) {
6351 ll_mul_add(acc, a_data[j+0], tbl[j+0]);
6352 ll_mul_add(acc, a_data[j+1], tbl[j+1]);
6353 ll_mul_add(acc, a_data[j+2], tbl[j+2]);
6354 ll_mul_add(acc, a_data[j+3], tbl[j+3]);
6355 }
6356
6357 for (; j < a_sz; j++)
6358 ll_mul_add(acc, a_data[j+0], tbl[j+0]);
6359
6360 acc0 = ll_get_lo(acc);
6361 ll_add(acc21, ll_get_hi(acc));
6362 }
6363
6364 long res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, red_struct);
6365 if (a_neg) res = NegateMod(res, p);
6366 return res;
6367 }
// Block length 8, fully unrolled.
6368 else if (Bnd == 8) {
6369 ll_type acc21;
6370 ll_init(acc21, 0);
6371 mp_limb_t acc0 = 0;
6372
6373 long jj = 0;
6374 for (; jj <= a_sz-8; jj += 8) {
6375 ll_type acc;
6376
6377 long j = jj;
6378
6379 ll_mul(acc, a_data[j+0], tbl[j+0]);
6380 ll_mul_add(acc, a_data[j+1], tbl[j+1]);
6381 ll_mul_add(acc, a_data[j+2], tbl[j+2]);
6382 ll_mul_add(acc, a_data[j+3], tbl[j+3]);
6383 ll_mul_add(acc, a_data[j+4], tbl[j+4]);
6384 ll_mul_add(acc, a_data[j+5], tbl[j+5]);
6385 ll_mul_add(acc, a_data[j+6], tbl[j+6]);
6386 ll_mul_add(acc, a_data[j+7], tbl[j+7]);
6387
6388 ll_add(acc, acc0);
6389 acc0 = ll_get_lo(acc);
6390 ll_add(acc21, ll_get_hi(acc));
6391 }
6392
// trailing partial block (fewer than 8 limbs)
6393 if (jj < a_sz) {
6394 ll_type acc;
6395 ll_init(acc, acc0);
6396
6397 long j = jj;
6398
6399 for (; j < a_sz; j++)
6400 ll_mul_add(acc, a_data[j+0], tbl[j+0]);
6401
6402 acc0 = ll_get_lo(acc);
6403 ll_add(acc21, ll_get_hi(acc));
6404 }
6405
6406 long res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, red_struct);
6407 if (a_neg) res = NegateMod(res, p);
6408 return res;
6409 }
// Remaining case: block length 4, fully unrolled.
6410 else /* Bnd == 4 */ {
6411 ll_type acc21;
6412 ll_init(acc21, 0);
6413 mp_limb_t acc0 = 0;
6414
6415 long jj = 0;
6416 for (; jj <= a_sz-4; jj += 4) {
6417 ll_type acc;
6418
6419 long j = jj;
6420
6421 ll_mul(acc, a_data[j+0], tbl[j+0]);
6422 ll_mul_add(acc, a_data[j+1], tbl[j+1]);
6423 ll_mul_add(acc, a_data[j+2], tbl[j+2]);
6424 ll_mul_add(acc, a_data[j+3], tbl[j+3]);
6425
6426
6427 ll_add(acc, acc0);
6428 acc0 = ll_get_lo(acc);
6429 ll_add(acc21, ll_get_hi(acc));
6430 }
6431
// trailing partial block (fewer than 4 limbs)
6432 if (jj < a_sz) {
6433 ll_type acc;
6434 ll_init(acc, acc0);
6435
6436 long j = jj;
6437
6438 for (; j < a_sz; j++)
6439 ll_mul_add(acc, a_data[j+0], tbl[j+0]);
6440
6441
6442 acc0 = ll_get_lo(acc);
6443 ll_add(acc21, ll_get_hi(acc));
6444 }
6445
6446 long res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, red_struct);
6447 if (a_neg) res = NegateMod(res, p);
6448 return res;
6449 }
6450 }
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460 #endif
6461
6462
6463
33 #include <NTL/new.h>
44
55 NTL_START_IMPL
6
7
8 NTL_TLS_GLOBAL_DECL(SmartPtr<zz_pInfoT>, zz_pInfo_stg)
9
10 NTL_CHEAP_THREAD_LOCAL zz_pInfoT *zz_pInfo = 0;
11
12
613
714 SmartPtr<zz_pInfoT> Build_zz_pInfo(FFTPrimeInfo *info)
815 {
2532 p = NewP;
2633 pinv = PrepMulMod(p);
2734 red_struct = sp_PrepRem(p);
35 ll_red_struct = make_sp_ll_reduce_struct(p);
2836
2937 p_info = 0;
3038
8189 p = info->q;
8290 pinv = info->qinv;
8391 red_struct = sp_PrepRem(p);
92 ll_red_struct = make_sp_ll_reduce_struct(p);
8493
8594
8695 p_info = info;
101110 p = q;
102111 pinv = PrepMulMod(p);
103112 red_struct = sp_PrepRem(p);
113 ll_red_struct = make_sp_ll_reduce_struct(p);
104114
105115
106116 p_info_owner.make();
119129 }
120130
121131
122
123 NTL_THREAD_LOCAL SmartPtr<zz_pInfoT> zz_pInfo = 0;
124
125
126
127132 void zz_p::init(long p, long maxroot)
128133 {
129134 zz_pContext c(p, maxroot);
164169
165170 void zz_pContext::save()
166171 {
167 ptr = zz_pInfo;
172 NTL_TLS_GLOBAL_ACCESS(zz_pInfo_stg);
173 ptr = zz_pInfo_stg;
168174 }
169175
170176 void zz_pContext::restore() const
171177 {
172 zz_pInfo = ptr;
178 NTL_TLS_GLOBAL_ACCESS(zz_pInfo_stg);
179 zz_pInfo_stg = ptr;
180 zz_pInfo = zz_pInfo_stg.get();
173181 }
174182
175183
225233 return s;
226234 }
227235
236
237
238 // ***********************************************************************
239
240
241 #ifdef NTL_HAVE_LL_TYPE
242
243
244 // NOTE: the following code sequence will generate imulq
245 // instructions on x86_64 machines, which empirically is faster
246 // than using the mulq instruction or even the mulxq instruction,
247 // (tested on a Haswell machine).
248
249 long
250 InnerProd_LL(const long *ap, const zz_p *bp, long n, long d,
251 sp_ll_reduce_struct dinv)
252 {
253 const long BLKSIZE = (1L << min(20, 2*(NTL_BITS_PER_LONG-NTL_SP_NBITS)));
254
255 unsigned long acc0 = 0;
256 ll_type acc21;
257 ll_init(acc21, 0);
258
259 long i;
260 for (i = 0; i <= n-BLKSIZE; i += BLKSIZE, ap += BLKSIZE, bp += BLKSIZE) {
261 // sum ap[j]*rep(bp[j]) for j in [0..BLKSIZE)
262
263 ll_type sum;
264 ll_init(sum, 0);
265 for (long j = 0; j < BLKSIZE; j += 4) {
266 ll_imul_add(sum, ap[j+0], rep(bp[j+0]));
267 ll_imul_add(sum, ap[j+1], rep(bp[j+1]));
268 ll_imul_add(sum, ap[j+2], rep(bp[j+2]));
269 ll_imul_add(sum, ap[j+3], rep(bp[j+3]));
270 }
271
272 ll_add(sum, acc0);
273 acc0 = ll_get_lo(sum);
274 ll_add(acc21, ll_get_hi(sum));
275 }
276
277 if (i < n) {
278 // sum ap[i]*rep(bp[j]) for j in [0..n-i)
279
280 ll_type sum;
281 ll_init(sum, 0);
282 long j = 0;
283 for (; j <= n-i-4; j += 4) {
284 ll_imul_add(sum, ap[j+0], rep(bp[j+0]));
285 ll_imul_add(sum, ap[j+1], rep(bp[j+1]));
286 ll_imul_add(sum, ap[j+2], rep(bp[j+2]));
287 ll_imul_add(sum, ap[j+3], rep(bp[j+3]));
288 }
289
290 for (; j < n-i; j++)
291 ll_imul_add(sum, ap[j], rep(bp[j]));
292
293
294 ll_add(sum, acc0);
295 acc0 = ll_get_lo(sum);
296 ll_add(acc21, ll_get_hi(sum));
297 }
298
299 if (dinv.nbits == NTL_SP_NBITS)
300 return sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, d, dinv);
301 else
302 return sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, d, dinv);
303 }
304
305
306 long
307 InnerProd_LL(const zz_p *ap, const zz_p *bp, long n, long d,
308 sp_ll_reduce_struct dinv)
309 {
310 const long BLKSIZE = (1L << min(20, 2*(NTL_BITS_PER_LONG-NTL_SP_NBITS)));
311
312 unsigned long acc0 = 0;
313 ll_type acc21;
314 ll_init(acc21, 0);
315
316 long i;
317 for (i = 0; i <= n-BLKSIZE; i += BLKSIZE, ap += BLKSIZE, bp += BLKSIZE) {
318 // sum ap[j]*rep(bp[j]) for j in [0..BLKSIZE)
319
320 ll_type sum;
321 ll_init(sum, 0);
322 for (long j = 0; j < BLKSIZE; j += 4) {
323 ll_imul_add(sum, rep(ap[j+0]), rep(bp[j+0]));
324 ll_imul_add(sum, rep(ap[j+1]), rep(bp[j+1]));
325 ll_imul_add(sum, rep(ap[j+2]), rep(bp[j+2]));
326 ll_imul_add(sum, rep(ap[j+3]), rep(bp[j+3]));
327 }
328
329 ll_add(sum, acc0);
330 acc0 = ll_get_lo(sum);
331 ll_add(acc21, ll_get_hi(sum));
332 }
333
334 if (i < n) {
335 // sum ap[i]*rep(bp[j]) for j in [0..n-i)
336
337 ll_type sum;
338 ll_init(sum, 0);
339 long j = 0;
340 for (; j <= n-i-4; j += 4) {
341 ll_imul_add(sum, rep(ap[j+0]), rep(bp[j+0]));
342 ll_imul_add(sum, rep(ap[j+1]), rep(bp[j+1]));
343 ll_imul_add(sum, rep(ap[j+2]), rep(bp[j+2]));
344 ll_imul_add(sum, rep(ap[j+3]), rep(bp[j+3]));
345 }
346
347 for (; j < n-i; j++)
348 ll_imul_add(sum, rep(ap[j]), rep(bp[j]));
349
350
351 ll_add(sum, acc0);
352 acc0 = ll_get_lo(sum);
353 ll_add(acc21, ll_get_hi(sum));
354 }
355
356 if (dinv.nbits == NTL_SP_NBITS)
357 return sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, d, dinv);
358 else
359 return sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, d, dinv);
360 }
361
362
363 long
364 InnerProd_L(const long *ap, const zz_p *bp, long n, long d,
365 sp_reduce_struct dinv)
366 {
367 unsigned long sum = 0;
368 long j = 0;
369
370 for (; j <= n-4; j += 4) {
371 sum += (ap[j+0]) * (rep(bp[j+0]));
372 sum += (ap[j+1]) * (rep(bp[j+1]));
373 sum += (ap[j+2]) * (rep(bp[j+2]));
374 sum += (ap[j+3]) * (rep(bp[j+3]));
375 }
376
377 for (; j < n; j++)
378 sum += (ap[j]) * (rep(bp[j]));
379
380 return rem(sum, d, dinv);
381 }
382
383 long
384 InnerProd_L(const zz_p *ap, const zz_p *bp, long n, long d,
385 sp_reduce_struct dinv)
386 {
387 unsigned long sum = 0;
388 long j = 0;
389
390 for (; j <= n-4; j += 4) {
391 sum += (rep(ap[j+0])) * (rep(bp[j+0]));
392 sum += (rep(ap[j+1])) * (rep(bp[j+1]));
393 sum += (rep(ap[j+2])) * (rep(bp[j+2]));
394 sum += (rep(ap[j+3])) * (rep(bp[j+3]));
395 }
396
397 for (; j < n; j++)
398 sum += (rep(ap[j])) * (rep(bp[j]));
399
400 return rem(sum, d, dinv);
401 }
402
403 #endif
404
405
406
228407 NTL_END_IMPL
44 #include <NTL/new.h>
55
66 NTL_START_IMPL
7
8
9 NTL_TLS_GLOBAL_DECL(SmartPtr<zz_pEInfoT>, zz_pEInfo_stg)
10
11 NTL_CHEAP_THREAD_LOCAL zz_pEInfoT *zz_pEInfo = 0;
12
713
814 zz_pEInfoT::zz_pEInfoT(const zz_pX& NewP)
915 {
3440
3541
3642
37 NTL_THREAD_LOCAL SmartPtr<zz_pEInfoT> zz_pEInfo = 0;
38
39
4043 void zz_pE::init(const zz_pX& p)
4144 {
4245 zz_pEContext c(p);
4649
4750 void zz_pEContext::save()
4851 {
49 ptr = zz_pEInfo;
52 NTL_TLS_GLOBAL_ACCESS(zz_pEInfo_stg);
53 ptr = zz_pEInfo_stg;
5054 }
5155
5256 void zz_pEContext::restore() const
5357 {
54 zz_pEInfo = ptr;
58 NTL_TLS_GLOBAL_ACCESS(zz_pEInfo_stg);
59 zz_pEInfo_stg = ptr;
60 zz_pEInfo = zz_pEInfo_stg.get();
5561 }
5662
5763
7783
7884 const zz_pE& zz_pE::zero()
7985 {
80 NTL_THREAD_LOCAL static zz_pE z(INIT_NO_ALLOC);
86 static const zz_pE z(INIT_NO_ALLOC); // GLOBAL (assumes C++11 thread-safe init)
8187 return z;
8288 }
8389
1212
1313 const zz_pEX& zz_pEX::zero()
1414 {
15 NTL_THREAD_LOCAL static zz_pEX z;
15 static const zz_pEX z; // GLOBAL (assumes C++11 thread-safe init)
1616 return z;
1717 }
1818
22142214 MulMod(A.H[i], A.H[i-1], h, F);
22152215 }
22162216
2217 NTL_THREAD_LOCAL long zz_pEXArgBound = 0;
2217 NTL_CHEAP_THREAD_LOCAL long zz_pEXArgBound = 0;
22182218
22192219
22202220
356356 }
357357
358358
359 NTL_THREAD_LOCAL long zz_pEX_BlockingFactor = 10;
359 NTL_CHEAP_THREAD_LOCAL long zz_pEX_BlockingFactor = 10;
360360
361361
362362
10601060
10611061 /************* NEW DDF ****************/
10621062
1063 NTL_THREAD_LOCAL long zz_pEX_GCDTableSize = 4;
1064 NTL_THREAD_LOCAL double zz_pEXFileThresh = NTL_FILE_THRESH;
1065 NTL_THREAD_LOCAL static vec_zz_pEX *BabyStepFile=0;
1066 NTL_THREAD_LOCAL static vec_zz_pEX *GiantStepFile=0;
1067 NTL_THREAD_LOCAL static long use_files;
1063 NTL_CHEAP_THREAD_LOCAL long zz_pEX_GCDTableSize = 4;
1064 NTL_CHEAP_THREAD_LOCAL double zz_pEXFileThresh = NTL_FILE_THRESH;
1065 static NTL_CHEAP_THREAD_LOCAL vec_zz_pEX *BabyStepFile=0;
1066 static NTL_CHEAP_THREAD_LOCAL vec_zz_pEX *GiantStepFile=0;
1067 static NTL_CHEAP_THREAD_LOCAL long use_files;
10681068
10691069
10701070 static
2121
2222 const zz_pX& zz_pX::zero()
2323 {
24 NTL_THREAD_LOCAL static zz_pX z;
24 static const zz_pX z; // GLOBAL (assumes C++11 thread-safe init)
2525 return z;
2626 }
2727
14651465 }
14661466
14671467
1468 #if 0
// Reference version (compiled out by the enclosing #if 0): a generic loop
// over all nprimes FFT primes for every output entry.
1469 // converts entries lo..lo+cnt-1 in R and stores results into res
1470 static
1471 void FromModularRep(zz_p* res, const fftRep& R, long lo, long cnt,
1472 zz_pInfoT* info)
1473 {
1474 if (cnt <= 0) return;
1475
1476 long nprimes = info->NumPrimes;
1477 long p = info->p;
// NOTE(review): pinv and x are loaded below but not referenced in this body.
1478 mulmod_t pinv = info->pinv;
1479 long *CoeffModP = info->CoeffModP.elts();
1480 double *x = info->x.elts();
1481 long *u = info->u.elts();
1482 mulmod_precon_t *uqinv = info->uqinv.elts();
1483 long MinusMModP = info->MinusMModP;
1484 mulmod_precon_t MinusMModPpinv = info->MinusMModPpinv;
1485 mulmod_precon_t *CoeffModPpinv = info->CoeffModPpinv.elts();
1486
// Per-prime data is cached in fixed arrays of size 4.
// NOTE(review): assumes nprimes <= 4 -- confirm against zz_pInfoT.
1487 long primes[4];
1488 double prime_recip[4];
1489 long *tbl[4];
1490
1491 long q, s, t;
1492 long i, j;
1493 double y;
1494
1495 for (i = 0; i < nprimes; i++) {
1496 primes[i] = GetFFTPrime(i);
1497 prime_recip[i] = GetFFTPrimeRecip(i);
1498 tbl[i] = R.tbl[i].get();
1499 }
1500
1501 for (j = 0; j < cnt; j++) {
// y accumulates s*prime_recip[i] in floating point; t accumulates the
// residue modulo p.
1502 y = double(0L);
1503 t = 0;
1504
1505 for (i = 0; i < nprimes; i++) {
1506 s = MulModPrecon(tbl[i][j+lo], u[i], primes[i], uqinv[i]);
1507 y = y + double(s)*prime_recip[i];
1508
1509
1510 // DIRT: uses undocumented MulMod feature (see sp_arith.h)
1511 // input s is not reduced mod p
1512 s = MulModPrecon(s, CoeffModP[i], p, CoeffModPpinv[i]);
1513
1514 t = AddMod(t, s, p);
1515 }
1516
// q is y rounded to the nearest integer; its multiple of MinusMModP is the
// final correction added to t.
1517 q = (long) (y + 0.5);
1518
1519 // DIRT: uses undocumented MulMod feature (see sp_arith.h)
1520 // input q may not be reduced mod p
1521 s = MulModPrecon(q, MinusMModP, p, MinusMModPpinv);
1522
1523 t = AddMod(t, s, p);
1524 res[j].LoopHole() = t;
1525 }
1526
1527 }
1528 #else
1529
// One per-prime step of FromModularRep, for prime index i.
// Expects the locals s, t, y, j, lo, p, tbl, u, primes, uqinv, prime_recip,
// CoeffModP and CoeffModPpinv to be in scope at the expansion site.
// It adds s*prime_recip[i] to y and adds (s * CoeffModP[i]) mod p to t.
1530 #define NTL_FMR_LOOP_BODY(i) \
1531 s = MulModPrecon(tbl[i][j+lo], u[i], primes[i], uqinv[i]);\
1532 y = y + double(s)*prime_recip[i];\
1533 \
1534 \
1535 /* DIRT: uses undocumented MulMod feature (see sp_arith.h) */\
1536 /* input s is not reduced mod p */\
1537 s = MulModPrecon(s, CoeffModP[i], p, CoeffModPpinv[i]);\
1538 \
1539 t = AddMod(t, s, p);\
1540
1541
// Outer loop of FromModularRep over the cnt output entries.
// XXX is the fully unrolled sequence of per-prime steps to run for each j.
// After XXX, y is rounded to the integer q, (q * MinusMModP) mod p is added
// to t, and t is stored into res[j].
// Expects j, cnt, y, t, q, s, p, MinusMModP, MinusMModPpinv and res in scope.
1542 #define NTL_FMP_OUTER_LOOP(XXX) \
1543 for (j = 0; j < cnt; j++) {\
1544 y = double(0L);\
1545 t = 0;\
1546 XXX \
1547 q = (long) (y + 0.5);\
1548 /* DIRT: uses undocumented MulMod feature (see sp_arith.h) */\
1549 /* input q may not be reduced mod p */\
1550 s = MulModPrecon(q, MinusMModP, p, MinusMModPpinv);\
1551 t = AddMod(t, s, p);\
1552 res[j].LoopHole() = t;\
1553 }\
1554
1555
1556
// Active version: specialised on the number of FFT primes (1, 2, 3 or 4) so
// that the per-prime loop is fully unrolled via the helper macros.
// Does nothing when cnt <= 0.
1557 // converts entries lo..lo+cnt-1 in R and stores results into res
1558 static
1559 void FromModularRep(zz_p* res, const fftRep& R, long lo, long cnt,
1560 zz_pInfoT* info)
1561 {
1562 if (cnt <= 0) return;
1563
1564 long nprimes = info->NumPrimes;
1565 long p = info->p;
// NOTE(review): pinv and x are loaded below but not referenced in this body
// (nor by the macros it expands).
1566 mulmod_t pinv = info->pinv;
1567 long *CoeffModP = info->CoeffModP.elts();
1568 double *x = info->x.elts();
1569 long *u = info->u.elts();
1570 mulmod_precon_t *uqinv = info->uqinv.elts();
1571 long MinusMModP = info->MinusMModP;
1572 mulmod_precon_t MinusMModPpinv = info->MinusMModPpinv;
1573 mulmod_precon_t *CoeffModPpinv = info->CoeffModPpinv.elts();
1574
// Per-prime data is cached in fixed arrays of size 4.
// NOTE(review): assumes nprimes <= 4 -- confirm against zz_pInfoT.
1575 long primes[4];
1576 double prime_recip[4];
1577 long *tbl[4];
1578
1579 long q, s, t;
1580 long i, j;
1581 double y;
1582
1583 for (i = 0; i < nprimes; i++) {
1584 primes[i] = GetFFTPrime(i);
1585 prime_recip[i] = GetFFTPrimeRecip(i);
1586 tbl[i] = R.tbl[i].get();
1587 }
1588
// Single-prime case: no floating-point quotient estimate is needed.
1589 if (nprimes == 1) {
1590 long *tbl_0 = tbl[0];
1591 mulmod_precon_t CoeffModPpinv_0 = CoeffModPpinv[0];
1592 long primes_0 = primes[0];
// hp0 is half of the single FFT prime (rounded down)
1593 long hp0 = primes_0 >> 1;
1594
1595 for (j = 0; j < cnt; j++) {
1596 s = tbl_0[j+lo];
1597
1598 // DIRT: uses undocumented MulMod feature (see sp_arith.h)
1599 // input s is not reduced mod p
1600 t = MulModPrecon(s, 1, p, CoeffModPpinv_0);
1601
// NOTE(review): presumably sp_SignMask yields an all-ones mask for a negative
// argument, so MinusMModP is added exactly when s > hp0 -- confirm in sp_arith.h.
1602 res[j].LoopHole() = AddMod(t, sp_SignMask(hp0-s) & MinusMModP, p);
1603 }
1604 }
1605 else if (nprimes == 2) {
1606 NTL_FMP_OUTER_LOOP( NTL_FMR_LOOP_BODY(0) NTL_FMR_LOOP_BODY(1) )
1607 }
1608 else if (nprimes == 3) {
1609 NTL_FMP_OUTER_LOOP( NTL_FMR_LOOP_BODY(0) NTL_FMR_LOOP_BODY(1) NTL_FMR_LOOP_BODY(2) )
1610 }
1611 else { // nprimes == 4
1612 NTL_FMP_OUTER_LOOP( NTL_FMR_LOOP_BODY(0) NTL_FMR_LOOP_BODY(1) NTL_FMR_LOOP_BODY(2) NTL_FMR_LOOP_BODY(3) )
1613 }
1614 }
1615
1616
1617
1618
1619 #endif
1620
1621
1622
14681623
14691624 void TofftRep(fftRep& y, const zz_pX& x, long k, long lo, long hi)
14701625 // computes an n = 2^k point convolution.
14711626 // if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
14721627 {
1473 zz_pInfoT *info = zz_pInfo.get();
1628 zz_pInfoT *info = zz_pInfo;
14741629 long p = info->p;
14751630
14761631 long n, i, j, m, j1;
14771632 long accum;
1478 long NumPrimes = info->NumPrimes;
1633 long nprimes = info->NumPrimes;
14791634
14801635
14811636 if (k > info->MaxRoot)
14971652 FFTPrimeInfo *p_info = info->p_info;
14981653
14991654 if (p_info) {
1500 for (j = 0; j < n; j++) {
1501 if (j >= m) {
1502 y.tbl[0][j] = 0;
1655 if (n >= m) {
1656 long *yp = &y.tbl[0][0];
1657 for (j = 0; j < m; j++) {
1658 yp[j] = rep(xx[j+lo]);
15031659 }
1504 else {
1660 for (j = m; j < n; j++) {
1661 yp[j] = 0;
1662 }
1663 }
1664 else {
1665 for (j = 0; j < n; j++) {
15051666 accum = rep(xx[j+lo]);
15061667 for (j1 = j + n; j1 < m; j1 += n)
15071668 accum = AddMod(accum, rep(xx[j1+lo]), p);
15101671 }
15111672 }
15121673 else {
1513 for (j = 0; j < n; j++) {
1514 if (j >= m) {
1515 for (i = 0; i < NumPrimes; i++)
1516 y.tbl[i][j] = 0;
1674 if (n >= m) {
1675 for (i = 0; i < nprimes; i++) {
1676 long q = GetFFTPrime(i);
1677 long *yp = &y.tbl[i][0];
1678 for (j = 0; j < m; j++) {
1679 long t = rep(xx[j+lo]);
1680 t = sp_CorrectExcess(t, q);
1681 yp[j] = t;
1682 }
1683 for (j = m; j < n; j++) {
1684 yp[j] = 0;
1685 }
15171686 }
1518 else {
1687 }
1688 else {
1689 for (j = 0; j < n; j++) {
15191690 accum = rep(xx[j+lo]);
15201691 for (j1 = j + n; j1 < m; j1 += n)
15211692 accum = AddMod(accum, rep(xx[j1+lo]), p);
1522 for (i = 0; i < NumPrimes; i++) {
1693 for (i = 0; i < nprimes; i++) {
15231694 long q = GetFFTPrime(i);
15241695 long t = accum;
1525 if (t >= q) t -= q;
1696 t = sp_CorrectExcess(t, q);
15261697 y.tbl[i][j] = t;
15271698 }
15281699 }
15351706 FFTFwd(yp, yp, k, *p_info);
15361707 }
15371708 else {
1538 for (i = 0; i < info->NumPrimes; i++) {
1709 for (i = 0; i < nprimes; i++) {
15391710 long *yp = &y.tbl[i][0];
15401711 FFTFwd(yp, yp, k, i);
15411712 }
15501721 // using "inverted" evaluation points.
15511722
15521723 {
1553 zz_pInfoT *info = zz_pInfo.get();
1724 zz_pInfoT *info = zz_pInfo;
15541725 long p = info->p;
15551726
15561727 long n, i, j, m, j1;
16041775 for (i = 0; i < NumPrimes; i++) {
16051776 long q = GetFFTPrime(i);
16061777 long t = accum;
1607 if (t >= q) t -= q;
1778 t = sp_CorrectExcess(t, q);
16081779 y.tbl[i][offset] = t;
16091780 }
16101781 }
16321803
16331804
16341805 {
1635 zz_pInfoT *info = zz_pInfo.get();
1806 zz_pInfoT *info = zz_pInfo;
16361807
16371808 long k, n, i, j, l;
16381809 long NumPrimes = info->NumPrimes;
16391810
1640 long t[4];
16411811
16421812 k = y.k;
16431813 n = (1L << k);
16671837 xp[j].LoopHole() = yp[j+lo];
16681838 }
16691839 else {
1670 for (j = 0; j < l; j++) {
1671 for (i = 0; i < NumPrimes; i++)
1672 t[i] = y.tbl[i][j+lo];
1673
1674 FromModularRep(x.rep[j], t, info);
1675 }
1840 FromModularRep(x.rep.elts(), y, lo, l, info);
16761841 }
16771842
16781843 x.normalize();
16861851
16871852
16881853 {
1689 zz_pInfoT *info = zz_pInfo.get();
1854 zz_pInfoT *info = zz_pInfo;
16901855
16911856 long k, n, i, j, l;
16921857 long NumPrimes = info->NumPrimes;
16931858
1694 long t[4];
16951859
16961860 k = y.k;
16971861 n = (1L << k);
17211885 xp[j].LoopHole() = yp[j+lo];
17221886 }
17231887 else {
1724 for (j = 0; j < l; j++) {
1725 for (i = 0; i < NumPrimes; i++)
1726 t[i] = y.tbl[i][j+lo];
1727
1728 FromModularRep(x[j], t, info);
1729 }
1888 FromModularRep(x.elts(), y, lo, l, info);
17301889 }
17311890 }
17321891
17331892 void NDFromfftRep(zz_pX& x, const fftRep& y, long lo, long hi, fftRep& z)
17341893 {
1735 zz_pInfoT *info = zz_pInfo.get();
1894 zz_pInfoT *info = zz_pInfo;
17361895
17371896 long k, n, i, j, l;
17381897 long NumPrimes = info->NumPrimes;
17391898
1740 long t[4];
17411899
17421900 k = y.k;
17431901 n = (1L << k);
17711929 xp[j].LoopHole() = zp[j+lo];
17721930 }
17731931 else {
1774 for (j = 0; j < l; j++) {
1775 for (i = 0; i < NumPrimes; i++)
1776 t[i] = z.tbl[i][j+lo];
1777
1778 FromModularRep(x.rep[j], t, info);
1779 }
1932 FromModularRep(x.rep.elts(), z, lo, l, info);
17801933 }
17811934
17821935 x.normalize();
17951948
17961949
17971950 {
1798 zz_pInfoT *info = zz_pInfo.get();
1951 zz_pInfoT *info = zz_pInfo;
17991952
18001953 long k, n, i, j;
18011954 long NumPrimes = info->NumPrimes;
18021955
1803 long t[4];
18041956
18051957 k = y.k;
18061958 n = (1L << k);
18241976 long *yp = &y.tbl[i][0];
18251977 FFTRev1(yp, yp, k, i);
18261978 }
1979
1980 // take coefficients lo..min(hi, n-1) from y
1981 // zero out coefficients max(n, lo)..hi
18271982
1828 for (j = lo; j <= hi; j++) {
1829 if (j >= n)
1830 clear(x[j-lo]);
1831 else {
1832 for (i = 0; i < info->NumPrimes; i++)
1833 t[i] = y.tbl[i][j];
1834
1835 FromModularRep(x[j-lo], t, info);
1836 }
1837 }
1983 long l = min(hi, n-1) - lo + 1;
1984 l = max(l, 0);
1985 FromModularRep(x, y, lo, l, info);
1986 for (j = max(n, lo); j <= hi; j++) clear(x[j-lo]);
18381987 }
18391988 }
18401989
18411990
18421991 void mul(fftRep& z, const fftRep& x, const fftRep& y)
18431992 {
1844 zz_pInfoT *info = zz_pInfo.get();
1993 zz_pInfoT *info = zz_pInfo;
18451994
18461995 long k, n, i, j;
18471996
18862035
18872036 void sub(fftRep& z, const fftRep& x, const fftRep& y)
18882037 {
1889 zz_pInfoT *info = zz_pInfo.get();
2038 zz_pInfoT *info = zz_pInfo;
18902039
18912040 long k, n, i, j;
18922041
19232072
19242073 void add(fftRep& z, const fftRep& x, const fftRep& y)
19252074 {
1926 zz_pInfoT *info = zz_pInfo.get();
2075 zz_pInfoT *info = zz_pInfo;
19272076
19282077 long k, n, i, j;
19292078
19632112 // reduces a 2^l point FFT-rep to a 2^k point FFT-rep
19642113 // input may alias output
19652114 {
1966 zz_pInfoT *info = zz_pInfo.get();
2115 zz_pInfoT *info = zz_pInfo;
19672116
19682117 long i, j, l, n;
19692118 long* xp;
19872136 void AddExpand(fftRep& x, const fftRep& a)
19882137 // x = x + (an "expanded" version of a)
19892138 {
1990 zz_pInfoT *info = zz_pInfo.get();
2139 zz_pInfoT *info = zz_pInfo;
19912140
19922141 long i, j, l, k, n;
19932142
0
10
21 #include <NTL/lzz_pX.h>
3
42 #include <NTL/new.h>
3
4 #ifdef NTL_HAVE_AVX
5 #include <immintrin.h>
6 #endif
7
58
69 NTL_START_IMPL
710
989992
990993
991994
992 NTL_THREAD_LOCAL long zz_pXArgBound = 0;
995 NTL_CHEAP_THREAD_LOCAL long zz_pXArgBound = 0;
993996
994997
995998 void CompMod(zz_pX& x, const zz_pX& g, const zz_pX& h, const zz_pXModulus& F)
10651068 x2 = xx2;
10661069 x3 = xx3;
10671070 }
1071
1072
1073 // BEGIN zz_pXAltArgument variation
1074
1075
1076
1077
// Builds altH, a table-based alternative form of the argument H, for use by
// CompMod(x, g, altH, F).
// altH stores the address of H (not a copy); CompMod later dereferences it.
// altH.strategy selects the evaluation method:
//   0 - no table is built
//   1 - long table, reduction data in pinv_L
//   2 - long table, reduction data in pinv_LL
//   3 - double table in 16-row panels (only when NTL_HAVE_AVX is defined)
// Without NTL_HAVE_LL_TYPE only altH.orig is set.
1078 void build(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F)
1079 {
1080 altH.orig = &H;
1081
1082
1083 #ifdef NTL_HAVE_LL_TYPE
// release any tables left over from a previous build
1084 altH.mem.kill();
1085 altH.row.kill();
1086
1087 #ifdef NTL_HAVE_AVX
1088 altH.dmem.kill();
1089 altH.drow.kill();
1090 #endif
1091
// small problems: skip building a table
1092 if (H.H.length() < 10 || F.n < 50) { altH.strategy = 0; return; }
1093
1094 altH.n = F.n;
1095 altH.m = H.H.length()-1;
1096
1097 long p = zz_p::modulus();
1098 long n = altH.n;
1099 long m = altH.m;
1100
1101
// Strategy selection. Each test is written as two divisions so that the
// products cannot overflow; together they imply m*(p-1)*(p-1) <= limit,
// where limit is 2^NTL_DOUBLE_PRECISION - 1 for the AVX case and the
// largest unsigned long for strategy 1.
1102 #ifdef NTL_HAVE_AVX
1103 if (n >= 128 && m <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1) &&
1104 m*(p-1) <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1)) {
1105 altH.strategy = 3;
1106 altH.pinv_L = sp_PrepRem(p);
1107 }
1108 else
1109 #endif
1110 if (cast_unsigned(m) <= (~(0UL))/cast_unsigned(p-1) &&
1111 cast_unsigned(m)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) {
1112 altH.strategy = 1;
1113 altH.pinv_L = sp_PrepRem(p);
1114 }
1115 else {
1116 altH.strategy = 2;
1117 altH.pinv_LL = make_sp_ll_reduce_struct(p);
1118 }
1119
1120
1121 if (altH.strategy == 1 || altH.strategy == 2) {
1122
// Table layout: row[j] points to m longs, with row[j][i] holding
// coefficient j of H.H[i] for i in 0..m-1.
1123 altH.row.SetLength(n);
1124 long **row = altH.row.elts();
1125
// Rows are allocated in blocks of BlockSize rows (about AllocAmt longs per
// block) rather than as one array of n*m longs.
1126 const long AllocAmt = 1L << 18;
1127
1128 long BlockSize = (AllocAmt + m - 1)/m;
1129 long NumBlocks = (n + BlockSize - 1)/BlockSize;
1130
1131 altH.mem.SetLength(NumBlocks);
1132
1133 for (long i = 0; i < NumBlocks; i++) {
1134 long first = i*BlockSize;
1135 long last = min(n, first + BlockSize);
1136 altH.mem[i].SetLength((last-first)*m);
1137 for (long j = first; j < last; j++) {
1138 row[j] = altH.mem[i].elts() + (j-first)*m;
1139 }
1140 }
1141
// Fill the table column by column; coefficients beyond the stored length
// of H.H[i] are zero.
1142 for (long i = 0; i < m; i++) {
1143 const zz_p* ptr = H.H[i].rep.elts();
1144 long len = H.H[i].rep.length();
1145 for (long j = 0; j < len; j++)
1146 row[j][i] = rep(ptr[j]);
1147 for (long j = len; j < n; j++)
1148 row[j][i] = 0;
1149 }
1150 }
1151 #ifdef NTL_HAVE_AVX
1152 else {
1153
1154 // sanity check
1155 if (m >= (1L << (NTL_BITS_PER_LONG-8))) ResourceError("zz_pXAltArgument: overflow");
1156
// Panel layout: the n rows are grouped into panels of 16 rows (the last
// panel is zero-padded). Panel k stores 16*m doubles, and entry
// [i*16 + r] holds coefficient 16*k + r of H.H[i].
1157 long npanels = (n+15)/16;
1158 long panel_size = 16*m;
1159
1160 const long AllocAmt = 1L << 18;
1161
1162 long BlockSize = (AllocAmt + panel_size - 1)/panel_size;
1163 long NumBlocks = (npanels + BlockSize - 1)/BlockSize;
1164
1165 altH.dmem.SetLength(NumBlocks);
1166 altH.drow.SetLength(npanels);
1167 double **drow = altH.drow.elts();
1168
1169 for (long i = 0; i < NumBlocks; i++) {
1170 long first = i*BlockSize;
1171 long last = min(npanels, first + BlockSize);
1172 altH.dmem[i].SetLength((last-first)*panel_size);
1173
1174 double *ptr = altH.dmem[i].get();
1175
1176 for (long j = first; j < last; j++)
1177 drow[j] = ptr + (j-first)*panel_size;
1178 }
1179
1180 for (long i = 0; i < m; i++) {
1181 const zz_p *ptr = H.H[i].rep.elts();
1182 long len = H.H[i].rep.length();
1183 for (long j = 0; j < len; j++)
1184 drow[j/16][(i*16) + (j%16)] = rep(ptr[j]);
// zero-fill up to the padded row count npanels*16, not just n
1185 for (long j = len; j < npanels*16; j++)
1186 drow[j/16][(i*16) + (j%16)] = 0;
1187 }
1188 }
1189
1190 #endif
1191
1192
1193 #endif
1194 }
1195
1196
1197 #ifdef NTL_HAVE_LL_TYPE
1198
1199
1200 #ifdef NTL_HAVE_AVX
// Computes x[r] = sum over i in 0..n-1 of a[16*i + r] * b[i], for r = 0..15.
// a is read 16 doubles per step and x receives 16 doubles.
// Both go through _mm256_load_pd / _mm256_store_pd, so a and x must be
// 32-byte aligned; b is read one element at a time via a broadcast.
1201 static
1202 void mul16rowsD(double *x, const double *a, const double *b, long n)
1203 {
1204 __m256d avec0, avec1, avec2, avec3;
1205
// four accumulators of four doubles each cover the 16 output rows
1206 __m256d acc0 = _mm256_setzero_pd();
1207 __m256d acc1 = _mm256_setzero_pd();
1208 __m256d acc2 = _mm256_setzero_pd();
1209 __m256d acc3 = _mm256_setzero_pd();
1210
1211 __m256d bvec;
1212
1213 for (long i = 0; i < n; i++) {
1214 bvec = _mm256_broadcast_sd(&b[i]);
1215
1216 avec0 = _mm256_load_pd(a); a += 4;
1217 avec1 = _mm256_load_pd(a); a += 4;
1218 avec2 = _mm256_load_pd(a); a += 4;
1219 avec3 = _mm256_load_pd(a); a += 4;
1220
1221 #ifdef NTL_HAVE_FMA
1222
1223 acc0 = _mm256_fmadd_pd(avec0, bvec, acc0);
1224 acc1 = _mm256_fmadd_pd(avec1, bvec, acc1);
1225 acc2 = _mm256_fmadd_pd(avec2, bvec, acc2);
1226 acc3 = _mm256_fmadd_pd(avec3, bvec, acc3);
1227
1228 #else
1229
1230 acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0);
1231 acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1);
1232 acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2);
1233 acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3);
1234
1235 #endif
1236
1237 }
1238
1239 _mm256_store_pd(x + 0*4, acc0);
1240 _mm256_store_pd(x + 1*4, acc1);
1241 _mm256_store_pd(x + 2*4, acc2);
1242 _mm256_store_pd(x + 3*4, acc3);
1243 }
1244
// Two-vector variant of mul16rowsD: in one pass over a it computes
//   x[r]  = sum over i of a[16*i + r] * b[i]
//   x_[r] = sum over i of a[16*i + r] * b_[i]      for r = 0..15,
// so each 16-double slice of a is loaded once and used for both outputs.
// a, x and x_ go through aligned loads/stores and must be 32-byte aligned.
1245 static
1246 void mul16rows2D(double *x, double *x_, const double *a, const double *b, const double *b_, long n)
1247 {
1248 __m256d avec0, avec1, avec2, avec3;
1249
// accumulators for x
1250 __m256d acc0 = _mm256_setzero_pd();
1251 __m256d acc1 = _mm256_setzero_pd();
1252 __m256d acc2 = _mm256_setzero_pd();
1253 __m256d acc3 = _mm256_setzero_pd();
1254
// accumulators for x_
1255 __m256d acc0_ = _mm256_setzero_pd();
1256 __m256d acc1_ = _mm256_setzero_pd();
1257 __m256d acc2_ = _mm256_setzero_pd();
1258 __m256d acc3_ = _mm256_setzero_pd();
1259
1260
1261 __m256d bvec;
1262 __m256d bvec_;
1263
1264 for (long i = 0; i < n; i++) {
1265 bvec = _mm256_broadcast_sd(&b[i]);
1266 bvec_ = _mm256_broadcast_sd(&b_[i]);
1267
1268 avec0 = _mm256_load_pd(a); a += 4;
1269 avec1 = _mm256_load_pd(a); a += 4;
1270 avec2 = _mm256_load_pd(a); a += 4;
1271 avec3 = _mm256_load_pd(a); a += 4;
1272
1273 #ifdef NTL_HAVE_FMA
1274
1275 acc0 = _mm256_fmadd_pd(avec0, bvec, acc0);
1276 acc1 = _mm256_fmadd_pd(avec1, bvec, acc1);
1277 acc2 = _mm256_fmadd_pd(avec2, bvec, acc2);
1278 acc3 = _mm256_fmadd_pd(avec3, bvec, acc3);
1279
1280 acc0_ = _mm256_fmadd_pd(avec0, bvec_, acc0_);
1281 acc1_ = _mm256_fmadd_pd(avec1, bvec_, acc1_);
1282 acc2_ = _mm256_fmadd_pd(avec2, bvec_, acc2_);
1283 acc3_ = _mm256_fmadd_pd(avec3, bvec_, acc3_);
1284
1285 #else
1286 acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0);
1287 acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1);
1288 acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2);
1289 acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3);
1290
1291 acc0_ = _mm256_add_pd(_mm256_mul_pd(avec0, bvec_), acc0_);
1292 acc1_ = _mm256_add_pd(_mm256_mul_pd(avec1, bvec_), acc1_);
1293 acc2_ = _mm256_add_pd(_mm256_mul_pd(avec2, bvec_), acc2_);
1294 acc3_ = _mm256_add_pd(_mm256_mul_pd(avec3, bvec_), acc3_);
1295
1296 #endif
1297
1298 }
1299
1300 _mm256_store_pd(x + 0*4, acc0);
1301 _mm256_store_pd(x + 1*4, acc1);
1302 _mm256_store_pd(x + 2*4, acc2);
1303 _mm256_store_pd(x + 3*4, acc3);
1304
1305 _mm256_store_pd(x_ + 0*4, acc0_);
1306 _mm256_store_pd(x_ + 1*4, acc1_);
1307 _mm256_store_pd(x_ + 2*4, acc2_);
1308 _mm256_store_pd(x_ + 3*4, acc3_);
1309 }
1310
1311
1312 #endif
1313
1314
1315
// Sets x to the polynomial whose coefficient i (for i in 0..n-1) is
// InnerProd_LL of table row H.row[i] with v[low..high], using the pinv_LL
// reduction data. high is first clamped to v.length()-1; if the resulting
// range is empty, x is cleared. The result is normalized.
1316 static
1317 void InnerProduct_LL(zz_pX& x, const vec_zz_p& v, long low, long high,
1318 const zz_pXAltArgument& H, long n)
1319 {
1320 high = min(high, v.length()-1);
1321 long len = high-low+1;
1322 if (len <= 0) {
1323 clear(x);
1324 return;
1325 }
1326
1327 x.rep.SetLength(n);
1328 zz_p *xp = x.rep.elts();
1329
1330 long p = zz_p::modulus();
1331 sp_ll_reduce_struct pinv = H.pinv_LL;
1332
// vp points at the first coefficient of the selected slice of v
1333 const zz_p *vp = v.elts() + low;
1334
1335 for (long i = 0; i < n; i++)
1336 xp[i].LoopHole() = InnerProd_LL(H.row[i], vp, len, p, pinv);
1337
1338 x.normalize();
1339 }
1340
// CompMod for strategy 2 (table rows combined via InnerProduct_LL).
// g is split into blocks of m = A.m coefficients. The blocks are combined
// Horner-style from the highest block down: t = t * A.orig->H[m] + block,
// with each multiplication done modulo F through the multiplier M.
// NOTE(review): presumably H[i] holds the i-th power of the composition
// argument modulo F -- confirm against build(zz_pXArgument&, ...).
// If deg(g) <= 0, g is returned unchanged.
1341 static
1342 void CompMod_LL(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
1343 const zz_pXModulus& F)
1344 {
1345 if (deg(g) <= 0) {
1346 x = g;
1347 return;
1348 }
1349
1350
1351 zz_pX s, t;
1352
1353 long m = A.m;
// l is the index of the last (possibly partial) block of g
1354 long l = ((g.rep.length()+m-1)/m) - 1;
1355
1356 zz_pXMultiplier M;
1357 build(M, A.orig->H[m], F);
1358
1359 InnerProduct_LL(t, g.rep, l*m, l*m + m - 1, A, F.n);
1360 for (long i = l-1; i >= 0; i--) {
1361 InnerProduct_LL(s, g.rep, i*m, i*m + m - 1, A, F.n);
1362 MulMod(t, t, M, F);
1363 add(t, t, s);
1364 }
1365
1366 x = t;
1367 }
1368
// Sets x to the polynomial whose coefficient i (for i in 0..n-1) is
// InnerProd_L of table row H.row[i] with v[low..high], using the pinv_L
// reduction data. high is first clamped to v.length()-1; if the resulting
// range is empty, x is cleared. The result is normalized.
1369 static
1370 void InnerProduct_L(zz_pX& x, const vec_zz_p& v, long low, long high,
1371 const zz_pXAltArgument& H, long n)
1372 {
1373 high = min(high, v.length()-1);
1374 long len = high-low+1;
1375 if (len <= 0) {
1376 clear(x);
1377 return;
1378 }
1379
1380 x.rep.SetLength(n);
1381 zz_p *xp = x.rep.elts();
1382
1383 long p = zz_p::modulus();
1384 sp_reduce_struct pinv = H.pinv_L;
1385
1386
// vp points at the first coefficient of the selected slice of v
1387 const zz_p *vp = v.elts() + low;
1388
1389 for (long i = 0; i < n; i++)
1390 xp[i].LoopHole() = InnerProd_L(H.row[i], vp, len, p, pinv);
1391
1392 x.normalize();
1393 }
1394
// CompMod for strategy 1 (table rows combined via InnerProduct_L).
// g is split into blocks of m = A.m coefficients. The blocks are combined
// Horner-style from the highest block down: t = t * A.orig->H[m] + block,
// with each multiplication done modulo F through the multiplier M.
// If deg(g) <= 0, g is returned unchanged.
1395 static
1396 void CompMod_L(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
1397 const zz_pXModulus& F)
1398 {
1399 if (deg(g) <= 0) {
1400 x = g;
1401 return;
1402 }
1403
1404
1405 zz_pX s, t;
1406
1407 long m = A.m;
// l is the index of the last (possibly partial) block of g
1408 long l = ((g.rep.length()+m-1)/m) - 1;
1409
1410 zz_pXMultiplier M;
1411 build(M, A.orig->H[m], F);
1412
1413 InnerProduct_L(t, g.rep, l*m, l*m + m - 1, A, F.n);
1414 for (long i = l-1; i >= 0; i--) {
1415 InnerProduct_L(s, g.rep, i*m, i*m + m - 1, A, F.n);
1416 MulMod(t, t, M, F);
1417 add(t, t, s);
1418 }
1419
1420 x = t;
1421 }
1422
1423
1424 #ifdef NTL_HAVE_AVX
1425
// AVX variant of the table inner product: v holds the coefficients as
// doubles. For every 16-row panel of H.drow, mul16rowsD produces 16
// double-valued sums; each is converted to an integer and reduced modulo p
// into the matching coefficient of x. high is first clamped to
// v.length()-1; if the resulting range is empty, x is cleared.
// NOTE(review): the double -> long conversion assumes the sums are exactly
// representable; presumably guaranteed by the strategy-3 bound in build --
// confirm.
1426 static
1427 void InnerProduct_AVX(zz_pX& x, const Vec<double>& v, long low, long high,
1428 const zz_pXAltArgument& H, long n)
1429 {
1430 high = min(high, v.length()-1);
1431 long len = high-low+1;
1432 if (len <= 0) {
1433 clear(x);
1434 return;
1435 }
1436
1437 x.rep.SetLength(n);
1438 zz_p *xp = x.rep.elts();
1439
1440 long p = zz_p::modulus();
1441 sp_reduce_struct pinv = H.pinv_L;
1442
1443
1444 const double *vp = v.elts() + low;
1445
// scratch buffer receiving the 16 sums of one panel
1446 NTL_AVX_LOCAL_ARRAY(res, double, 16);
1447
1448 long npanels = H.drow.length();
1449
1450 for (long i = 0, first = 0; i < npanels; i++, first += 16) {
1451 mul16rowsD(res, H.drow[i], vp, len);
// the last panel may extend past row n-1; only rows below n are stored
1452 long last = min(n, first + 16);
1453 for (long ii = first; ii < last; ii++)
1454 xp[ii].LoopHole() = rem((unsigned long) (long) res[ii-first], p, pinv);
1455 }
1456
1457 x.normalize();
1458 }
1459
// Computes two table inner products in one pass over the panels:
//   x  from the len doubles starting at v[low]
//   x_ from the len doubles starting at v[low_]
// Unlike InnerProduct_AVX, no clamping against v.length() is done here;
// the caller must ensure both slices lie inside v.
1460 static
1461 void InnerProduct2_AVX(zz_pX& x, zz_pX& x_, const Vec<double>& v, long low, long low_, long len,
1462 const zz_pXAltArgument& H, long n)
1463 {
1464 x.rep.SetLength(n);
1465 zz_p *xp = x.rep.elts();
1466
1467 x_.rep.SetLength(n);
1468 zz_p *xp_ = x_.rep.elts();
1469
1470 long p = zz_p::modulus();
1471 sp_reduce_struct pinv = H.pinv_L;
1472
1473
1474 const double *vp = v.elts() + low;
1475 const double *vp_ = v.elts() + low_;
1476
// scratch buffers receiving the 16 sums of one panel for each slice
1477 NTL_AVX_LOCAL_ARRAY(res, double, 16);
1478 NTL_AVX_LOCAL_ARRAY(res_, double, 16);
1479
1480 long npanels = H.drow.length();
1481
1482 for (long i = 0, first = 0; i < npanels; i++, first += 16) {
1483 mul16rows2D(res, res_, H.drow[i], vp, vp_, len);
// the last panel may extend past row n-1; only rows below n are stored
1484 long last = min(n, first + 16);
1485 for (long ii = first; ii < last; ii++) {
1486 xp[ii].LoopHole() = rem((unsigned long) (long) res[ii-first], p, pinv);
1487 xp_[ii].LoopHole() = rem((unsigned long) (long) res_[ii-first], p, pinv);
1488 }
1489 }
1490
1491 x.normalize();
1492 x_.normalize();
1493 }
1494
// CompMod for strategy 3 (AVX double-precision tables).
// Same Horner-style block scheme as the other CompMod_* variants, except
// that g's coefficients are first copied into a Vec<double> and full blocks
// are processed two at a time via InnerProduct2_AVX.
// If deg(g) <= 0, g is returned unchanged.
1495 static
1496 void CompMod_AVX(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
1497 const zz_pXModulus& F)
1498 {
1499 if (deg(g) <= 0) {
1500 x = g;
1501 return;
1502 }
1503
1504
1505 zz_pX s, s_, t;
1506
1507 long m = A.m;
// l is the index of the last (possibly partial) block of g
1508 long l = ((g.rep.length()+m-1)/m) - 1;
1509
1510 zz_pXMultiplier M;
1511 build(M, A.orig->H[m], F);
1512
// convert the coefficients of g to doubles once, up front
1513 long len = g.rep.length();
1514 Vec<double> gg;
1515 gg.SetLength(len);
1516 for (long i = 0; i < len; i++) gg[i] = rep(g.rep[i]);
1517
1518 InnerProduct_AVX(t, gg, l*m, l*m + m - 1, A, F.n);
1519 long i = l-1;
// blocks i and i-1 are both full (length m), so they can share one pass
1520 for (; i >= 1; i -= 2) {
1521 InnerProduct2_AVX(s, s_, gg, i*m, (i-1)*m, m, A, F.n);
1522 MulMod(t, t, M, F);
1523 add(t, t, s);
1524 MulMod(t, t, M, F);
1525 add(t, t, s_);
1526 }
1527
// one unpaired block (index 0) remains when the loop above exits with i == 0
1528 if (i >= 0) {
1529 InnerProduct_AVX(s, gg, i*m, i*m + m - 1, A, F.n);
1530 MulMod(t, t, M, F);
1531 add(t, t, s);
1532 }
1533
1534 x = t;
1535 }
1536 #endif
1537
1538
1539
1540 #endif
1541
1542
1543
// Public entry point for composition with a zz_pXAltArgument.
// Raises LogicError if A was never built (A.orig is null) or if A.strategy
// is not a recognised value. Without NTL_HAVE_LL_TYPE it always forwards to
// the zz_pXArgument overload of CompMod on *A.orig; otherwise it dispatches
// on A.strategy.
1544 void CompMod(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
1545 const zz_pXModulus& F)
1546 {
1547 if (!A.orig) LogicError("CompMod: uninitialized arg");
1548
1549 #ifndef NTL_HAVE_LL_TYPE
1550 CompMod(x, g, *A.orig, F);
1551 #else
1552
1553 switch (A.strategy) {
// 0: no table was built; use the original argument
1554 case 0:
1555 CompMod(x, g, *A.orig, F);
1556 break;
1557
1558 case 1:
1559 CompMod_L(x, g, A, F);
1560 break;
1561
1562 case 2:
1563 CompMod_LL(x, g, A, F);
1564 break;
1565
1566 #ifdef NTL_HAVE_AVX
1567 case 3:
1568 CompMod_AVX(x, g, A, F);
1569 break;
1570
1571 #endif
1572
1573 default:
1574 LogicError("CompMod: bad strategy");
1575 }
1576 #endif
1577
1578 }
1579
1580
1581
1582 // END zz_pXAltArgument variation
1583
1584
1585
10681586
10691587 static void StripZeroes(vec_zz_p& x)
10701588 {
0
10
21 #include <NTL/lzz_pXFactoring.h>
3 #include <NTL/vec_vec_lzz_p.h>
2 #include <NTL/mat_lzz_p.h>
43 #include <NTL/FacVec.h>
54
65 #include <NTL/new.h>
143142
144143
145144 static
146 void BuildMatrix(vec_vec_zz_p& M,
145 void BuildMatrix(mat_zz_p& M,
147146 long n, const zz_pX& g, const zz_pXModulus& F, long verbose)
148147 {
149 long i, j, m;
150148 zz_pXMultiplier G;
151149 zz_pX h;
152150
153 M.SetLength(n);
154 for (i = 0; i < n; i++)
155 M[i].SetLength(n);
151 M.SetDims(n, n);
156152
157153 build(G, g, F);
158154
159155 set(h);
160 for (j = 0; j < n; j++) {
161 if (verbose && j % 10 == 0) cerr << "+";
162
163 m = deg(h);
164 for (i = 0; i < n; i++) {
165 if (i <= m)
166 M[i][j] = h.rep[i];
167 else
168 clear(M[i][j]);
169 }
170
171 if (j < n-1)
156 for (long i = 0; i < n; i++) {
157 if (verbose && i % 10 == 0) cerr << "+";
158
159 VectorCopy(M[i], h, n);
160
161 if (i < n-1)
172162 MulMod(h, h, G, F);
173163 }
174164
175 for (i = 0; i < n; i++)
165 for (long i = 0; i < n; i++)
176166 add(M[i][i], M[i][i], -1);
177167
178168 }
225215
226216
227217 static
228 void RandomBasisElt(zz_pX& g, const vec_long& D,
229 const vec_vec_zz_p& M)
230 {
231 zz_p t1, t2;
232
233 long n = D.length();
234
235 long i, j, s;
236
237 g.rep.SetLength(n);
238
239 vec_zz_p& v = g.rep;
240
241 for (j = n-1; j >= 0; j--) {
242 if (D[j] == -1)
243 random(v[j]);
244 else {
245 i = D[j];
246
247 // v[j] = sum_{s=j+1}^{n-1} v[s]*M[i,s]
248
249 clear(t1);
250
251 for (s = j+1; s < n; s++) {
252 mul(t2, v[s], M[i][s]);
253 add(t1, t1, t2);
254 }
255
256 v[j] = t1;
257 }
258 }
259
218 void RandomBasisElt(zz_pX& g, mat_zz_p& ker)
219 {
220 long r = ker.NumRows();
221 long n = ker.NumCols();
222
223 vec_zz_p v;
224 v.SetLength(r);
225 for (long i = 0; i < r; i++) random(v[i]);
226
227 mul(g.rep, v, ker);
260228 g.normalize();
261229 }
262230
386354 PowerXMod(g, p, F);
387355 if (verbose) { cerr << (GetTime()-t) << "\n"; }
388356
389 vec_long D;
390 long r;
391
392 vec_vec_zz_p M;
357 mat_zz_p M, ker;
393358
394359 if (verbose) { cerr << "building matrix..."; t = GetTime(); }
395360 BuildMatrix(M, n, g, F, verbose);
396361 if (verbose) { cerr << (GetTime()-t) << "\n"; }
397362
398363 if (verbose) { cerr << "diagonalizing..."; t = GetTime(); }
399 NullSpace(r, D, M, verbose);
364 kernel(ker, M);
400365 if (verbose) { cerr << (GetTime()-t) << "\n"; }
401366
367
368 M.kill();
369
370 long r = ker.NumRows();
402371
403372 if (verbose) cerr << "number of factors = " << r << "\n";
404373
412381
413382 vec_zz_p roots;
414383
415 RandomBasisElt(g, D, M);
384 RandomBasisElt(g, ker);
416385 MinPolyMod(h, g, F, r);
417 if (deg(h) == r) M.kill();
418386 FindRoots(roots, h);
419387 FindFactors(factors, f, g, roots);
420388
424392
425393 while (factors.length() < r) {
426394 if (verbose) cerr << "+";
427 RandomBasisElt(g, D, M);
395 RandomBasisElt(g, ker);
428396 S.kill();
429397 for (i = 0; i < factors.length(); i++) {
430398 const zz_pX& f = factors[i];
673641 return !IsX(s);
674642 }
675643
676 NTL_THREAD_LOCAL long zz_pX_BlockingFactor = 10;
644 NTL_CHEAP_THREAD_LOCAL long zz_pX_BlockingFactor = 10;
677645
678646 void DDF(vec_pair_zz_pX_long& factors, const zz_pX& ff, const zz_pX& hh,
679647 long verbose)
15291497
15301498 /************* NEW DDF ****************/
15311499
1532 NTL_THREAD_LOCAL long zz_pX_GCDTableSize = 4;
1533 NTL_THREAD_LOCAL static vec_zz_pX *BabyStepFile = 0;
1534 NTL_THREAD_LOCAL static vec_zz_pX *GiantStepFile = 0;
1535 NTL_THREAD_LOCAL static zz_pXArgument *HHH = 0;
1536 NTL_THREAD_LOCAL static long OldN = 0;
1500 NTL_CHEAP_THREAD_LOCAL long zz_pX_GCDTableSize = 4;
1501 static NTL_CHEAP_THREAD_LOCAL vec_zz_pX *BabyStepFile = 0;
1502 static NTL_CHEAP_THREAD_LOCAL vec_zz_pX *GiantStepFile = 0;
1503 static NTL_CHEAP_THREAD_LOCAL zz_pXArgument *HHH = 0;
1504 static NTL_CHEAP_THREAD_LOCAL zz_pXAltArgument *HHH1 = 0;
1505 static NTL_CHEAP_THREAD_LOCAL long OldN = 0;
15371506
15381507
15391508 static
15671536 else {
15681537 zz_pXArgument H;
15691538 build(H, h, F, 2*rootn);
1539
1540 zz_pXAltArgument H1;
1541 build(H1, H, F);
15701542
15711543
15721544 for (i = 1; i <= k-1; i++) {
15731545 (*BabyStepFile)(i) = h1;
15741546
1575 CompMod(h1, h1, H, F);
1576 if (verbose) cerr << "+";
1547 CompMod(h1, h1, H1, F);
1548 if (verbose) cerr << ".";
15771549 }
15781550 }
15791551
15911563 build(F, f);
15921564
15931565 build(*HHH, h, F, 2*SqrRoot(F.n));
1566 build(*HHH1, *HHH, F);
15941567
15951568 OldN = F.n;
15961569
16771650 rem(last, last, F);
16781651 for (long i = 0; i < (*HHH).H.length(); i++)
16791652 rem((*HHH).H[i], (*HHH).H[i], F);
1653 build(*HHH1, *HHH, F);
16801654 OldN = F.n;
16811655 }
16821656
16831657 (*GiantStepFile).SetLength(l+1);
1684 CompMod((*GiantStepFile)(l+1), last, *HHH, F);
1658 CompMod((*GiantStepFile)(l+1), last, *HHH1, F);
16851659 g = (*GiantStepFile)(l+1);
16861660 }
16871661 else if (deg((*GiantStepFile)(gs)) >= F.n)
19101884 }
19111885
19121886 long B = deg(f)/2;
1887
19131888 long k = SqrRoot(B);
1889
1890 // we double the number of baby steps if it seems like
1891 // baby steps are significantly cheaper than giant steps.
1892 // The calculations below are closely tied to a test in GenerateBabySteps:
1893 // if nbm >= sdf/2, then scale should be 1 (baby steps and giant steps balanced)
1894 if (B >= 500) {
1895 long sdf = SqrRoot(deg(f));
1896 long nbm = NumBits(zz_p::modulus());
1897 double scale = 0.25*double(sdf)/double(nbm);
1898 if (scale < 1) scale = 1;
1899 if (scale > 2) scale = 2;
1900 k = long(scale*k);
1901 }
1902
19141903 long l = (B+k-1)/k;
19151904
19161905 vec_zz_pX local_BabyStepFile;
19171906 vec_zz_pX local_GiantStepFile;
19181907 zz_pXArgument local_HHH;
1908 zz_pXAltArgument local_HHH1;
19191909
19201910 BabyStepFile = &local_BabyStepFile;
19211911 GiantStepFile = &local_GiantStepFile;
19221912 HHH = &local_HHH;
1913 HHH1 = &local_HHH1;
19231914
19241915 zz_pX h1;
19251916 GenerateBabySteps(h1, f, h, k, verbose);
1515 #define NTL_WIDE_DOUBLE_DP ((wide_double(1L<<52)))
1616 #define NTL_QUAD_FLOAT_SPLIT ((((double)(1L<<27)))+1.0)
1717 #define NTL_EXT_DOUBLE (0)
18
19
18 #define NTL_FMA_DETECTED (1)
2019
2120
2221
1212
1313 CXXFLAGS=-g -O2
1414 # Flags for the C++ compiler
15
16 CXXAUTOFLAGS=
17 # Flags for the C++ compiler, automatically generated by configuration script
1518
1619
1720 AR=ar
6972
7073 GMP_OPT_INCDIR=# -I$(GMP_INCDIR) # GMPI
7174 GMP_OPT_LIBDIR=# -L$(GMP_LIBDIR) # GMPL
72 GMP_OPT_LIB=# -lgmp # GMP
75 GMP_OPT_LIB=-lgmp # GMP
7376 # uncomment these if using GMP
7477
7578
136139 O16=$(O15)
137140 O17=$(O16)
138141 O18=$(O17) xdouble.o
139 O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o
142 O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o BasicThreadPool.o
140143
141144 OBJ=$(O19)
142145
161164 S16=$(S15)
162165 S17=$(S16)
163166 S18=$(S17) xdouble.c
164 S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c
167 S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c BasicThreadPool.c
165168
166169 SRC = $(S19)
167170
193196 IN16=$(IN15) vec_vec_ZZ_p.h vec_vec_ZZ_pE.h vec_vec_long.h vec_vec_lzz_p.h
194197 IN17=$(IN16) vec_vec_lzz_pE.h vec_xdouble.h xdouble.h config.h version.h
195198 IN18=$(IN17) def_config.h new.h vec_ulong.h vec_vec_ulong.h c_lip.h g_lip.h
196 IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h
197 IN20=$(IN19) have_LL_no.h have_LL_yes.h have_builtin_clzl_no.h have_builtin_clzl_yes.h
198
199 INCL=$(IN20)
199 IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h BasicThreadPool.h
200 INCL=$(IN19)
200201
201202
202203
212213 # test source files
213214
214215 TS1=QuickTest.c BerlekampTest.c CanZassTest.c ZZXFacTest.c MoreFacTest.c LLLTest.c
215 TS2=$(TS1) subset.c MatrixTest.c CharPolyTest.c RRTest.c QuadTest.c
216 TS2=$(TS1) subset.c MatrixTest.c mat_lzz_pTest.c CharPolyTest.c RRTest.c QuadTest.c
216217 TS3=$(TS2) GF2XTest.c GF2EXTest.c BitMatTest.c ZZ_pEXTest.c lzz_pEXTest.c Timing.c
217218 TS4=$(TS3) ThreadTest.c ExceptionTest.c
218219 TS = $(TS4)
219220
220221 # scripts
221222
222 SCRIPTS1=MakeGetTime MakeGetPID MakeCheckCLZL MakeCheckLL TestScript dosify unixify RemoveProg
223 SCRIPTS1=MakeGetTime MakeGetPID MakeCheckFeature ResetFeatures CopyFeatures TestScript dosify unixify RemoveProg
223224 SCRIPTS2=$(SCRIPTS1) configure DoConfig mfile cfile ppscript
224225
225226 SCRIPTS=$(SCRIPTS2)
226227
227228 # auxiliary source
228229
229 MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c CheckPCLMUL.c
230 GT=GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
230 MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c
231 GT=GetTime0.c GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
231232 GP=GetPID1.c GetPID2.c TestGetPID.c
232 CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c
233 CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c CheckAVX.c CheckFMA.c CheckCompile.c
234
235 AUXPROGS = TestGetTime TestGetPID CheckFeature CheckCompile
233236
234237
235238
236239 # documentation
237240
238241
239 D01=copying.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
242 D01=copying.txt BasicThreadPool.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
240243 D02=$(D01) GF2XFactoring.txt GF2XVec.txt HNF.txt Lazy.txt LazyTable.txt LLL.txt RR.txt SmartPtr.txt
241244 D03=$(D02) ZZ.txt ZZVec.txt ZZX.txt ZZXFactoring.txt ZZ_p.txt ZZ_pE.txt
242245 D04=$(D03) ZZ_pEX.txt ZZ_pEXFactoring.txt ZZ_pX.txt ZZ_pXFactoring.txt
252255 D14=$(D13) tour-modules.html tour-unix.html tour-examples.html
253256 D15=$(D14) tour-roadmap.html tour-win.html tour-impl.html tour-struct.html
254257 D16=$(D15) tour.html tour-ex1.html tour-ex2.html tour-ex3.html tour-ex4.html
255 D17=$(D16) tour-ex5.html tour-ex6.html arrow1.gif arrow2.gif arrow3.gif
258 D17=$(D16) tour-ex5.html tour-ex6.html tour-ex7.html arrow1.gif arrow2.gif arrow3.gif
256259 D18=$(D17) tour-gmp.html tour-gf2x.html tour-tips.html config.txt version.txt
257260
258261 TX01=GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt GF2XFactoring.txt
263266 TX06=mat_ZZ_pE.txt mat_lzz_p.txt mat_lzz_pE.txt mat_poly_ZZ.txt mat_poly_ZZ_p.txt
264267 TX07=mat_poly_lzz_p.txt matrix.txt pair.txt quad_float.txt tools.txt vec_GF2.txt
265268 TX08=vec_GF2E.txt vec_RR.txt vec_ZZ.txt vec_ZZ_p.txt vec_ZZ_pE.txt vec_lzz_p.txt
266 TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt
269 TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt BasicThreadPool.txt
267270
268271 TXFILES=$(TX01) $(TX02) $(TX03) $(TX04) $(TX05) $(TX06) $(TX07) $(TX08) $(TX09)
269272
275278 HT06=mat_ZZ_pE.cpp.html mat_lzz_p.cpp.html mat_lzz_pE.cpp.html mat_poly_ZZ.cpp.html mat_poly_ZZ_p.cpp.html
276279 HT07=mat_poly_lzz_p.cpp.html matrix.cpp.html pair.cpp.html quad_float.cpp.html tools.cpp.html vec_GF2.cpp.html
277280 HT08=vec_GF2E.cpp.html vec_RR.cpp.html vec_ZZ.cpp.html vec_ZZ_p.cpp.html vec_ZZ_pE.cpp.html vec_lzz_p.cpp.html
278 HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html
281 HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html BasicThreadPool.cpp.html
279282
280283 HTFILES=$(HT01) $(HT02) $(HT03) $(HT04) $(HT05) $(HT06) $(HT07) $(HT08) $(HT09)
281284
287290 # test program executables
288291
289292 PROG1=QuickTest BerlekampTest CanZassTest ZZXFacTest MoreFacTest LLLTest BitMatTest
290 PROG2=$(PROG1) MatrixTest CharPolyTest RRTest QuadTest
293 PROG2=$(PROG1) MatrixTest mat_lzz_pTest CharPolyTest RRTest QuadTest
291294 PROG3=$(PROG2) GF2XTest GF2EXTest subset ZZ_pEXTest lzz_pEXTest Timing ThreadTest
292295 PROGS = $(PROG3)
293296
294297 # things to save to a tar file
295298
296299 SFI1=makefile $(SRC) $(SINC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) mach_desc.win
297 SFI2=$(SFI1) MulTimeTest.c PolyTimeTest.c Poly1TimeTest.c GF2XTimeTest.c
300 SFI2=$(SFI1) MulTimeTest.c Poly1TimeTest.c Poly2TimeTest.c Poly3TimeTest.c GF2XTimeTest.c
298301 SFI3=$(SFI2) InitSettings.c DispSettings.c WizardAux Wizard def_makefile
299302 SFILES=$(SFI3)
300303
309312 NTL_INCLUDE = -I../include -I.
310313 # NTL needs this to find its include files
311314
312 COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) -c
313
314 LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS)
315 COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) -c
316
317 LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) $(LDFLAGS)
315318
316319
317320
341344 # setup2 does some dynamic checks for GetTime, GetPID, __builtin_clzl, LL types, AVX, and FMA
342345
343346 setup2:
347 echo "*** CheckFeature log ***" > CheckFeature.log
344348 sh MakeGetTime "$(LINK)" "$(LDLIBS)"
345349 sh MakeGetPID "$(LINK)" "$(LDLIBS)"
346 sh MakeCheckCLZL "$(LINK)" "$(LDLIBS)"
347 sh MakeCheckLL "$(LINK)" "$(LDLIBS)"
350 sh MakeCheckFeature BUILTIN_CLZL "CheckCLZL.c CheckCLZLAux.c" "$(LINK)" "$(LDLIBS)"
351 sh MakeCheckFeature LL_TYPE "CheckLL.c CheckLLAux.c" "$(LINK)" "$(LDLIBS)"
352 sh MakeCheckFeature AVX "CheckAVX.c" "$(LINK)" "$(LDLIBS)"
353 sh MakeCheckFeature FMA "CheckFMA.c" "$(LINK)" "$(LDLIBS)"
348354
349355 # setup3 generates the file ../include/NTL/gmp_aux.h
350356 # The file ../include/NTL/gmp_aux.h is included in ../include/NTL/lip.h
382388 GetPID.o: GetPID.c
383389 $(LCOMP) $(COMPILE) GetPID.c
384390
385 CheckPCLMUL: CheckPCLMUL.c
386 $(LINK) -o CheckPCLMUL CheckPCLMUL.c $(LDLIBS)
391 CheckCompile: CheckCompile.c
392 $(LINK) -o CheckCompile CheckCompile.c $(LDLIBS)
393
387394
388395 .c.o:
389396 $(LCOMP) $(COMPILE) $(GF2X_OPT_INCDIR) $<
460467
461468 clobber:
462469 rm -f ntl.a mach_desc.h ../include/NTL/mach_desc.h GetTime.c GetPID.c
463 cp ../include/NTL/have_LL_no.h ../include/NTL/have_LL.h
464 cp ../include/NTL/have_builtin_clzl_no.h ../include/NTL/have_builtin_clzl.h
470 sh ResetFeatures '..'
465471 rm -f ../include/NTL/gmp_aux.h
466 sh RemoveProg $(PROGS) MakeDesc TestGetTime TestGetPID gen_gmp_aux
472 sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
467473 rm -f *.o
468474 rm -rf small
469475 rm -f cfileout mfileout
471477 rm -f all
472478
473479 clean:
474 sh RemoveProg MakeDesc TestGetTime TestGetPID gen_gmp_aux
480 sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
475481 rm -f *.o
476482 rm -rf small
477483 # - $(LIBTOOL) --mode=clean rm -f libntl.la *.lo #LSHAR
497503
498504
499505 package:
506 ./configure --nowrite
507 cp mfileout def_makefile
508 cp cfileout ../include/NTL/def_config.h
500509 sh unixify "$(SFILES) DIRNAME WINDIR VERSION_INFO NOTES" "$(INCL)" "$(DOC)"
501510 rm -rf `cat DIRNAME`
502511 rm -f `cat DIRNAME`.tar
508517 rm -rf `cat DIRNAME`
509518
510519 winpack:
520 ./configure --nowrite NTL_GMP_LIP=off
521 cp mfileout def_makefile
522 cp cfileout ../include/NTL/def_config.h
511523 sh dosify "$(SRC)" "$(INCL)" "$(DOC)" "$(TS)" "$(TD)" "$(SINC)"
512524 rm -rf `cat WINDIR`
513525 rm -f `cat WINDIR`.zip
526538
527539 WO1 = FFT.o GetTime.o GetPID.o ctools.o ZZ.o ZZVec.o ZZ_p.o ZZ_pX.o
528540 WO2 = $(WO1) ZZ_pX1.o lip.o tools.o vec_ZZ.o vec_ZZ_p.o
529 WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o fileio.o
541 WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o BasicThreadPool.o fileio.o
530542
531543 WOBJ = $(WO3)
532544
538550 MulTimeTest:
539551 $(LINK) -o MulTimeTest MulTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
540552
541 PolyTimeTest:
542 $(LINK) -o PolyTimeTest PolyTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
543553
544554 Poly1TimeTest:
545555 $(LINK) -o Poly1TimeTest Poly1TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
556 Poly2TimeTest:
557 $(LINK) -o Poly2TimeTest Poly2TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
558 Poly3TimeTest:
559 $(LINK) -o Poly3TimeTest Poly3TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
546560
547561
548562 GF2XTimeTest:
314314
315315
316316
317 void solve(ref_GF2 d, vec_GF2& X, const mat_GF2& A, const vec_GF2& b)
317 static
318 void solve_impl(ref_GF2 d, vec_GF2& X, const mat_GF2& A, const vec_GF2& b, bool trans)
318319
319320 {
320321 long n = A.NumRows();
335336 mat_GF2 M;
336337 M.SetDims(n, n+1);
337338
338 for (i = 0; i < n; i++) {
339 AddToCol(M, i, A[i]);
339 if (trans) {
340 for (i = 0; i < n; i++) {
341 AddToCol(M, i, A[i]);
342 }
343 }
344 else {
345 for (i = 0; i < n; i++) {
346 VectorCopy(M[i], A[i], n+1);
347 }
340348 }
341349
342350 AddToCol(M, n, b);
397405 return;
398406 }
399407
408 void solve(ref_GF2 d, vec_GF2& x, const mat_GF2& A, const vec_GF2& b)
409 {
410 solve_impl(d, x, A, b, true);
411 }
412
413 void solve(ref_GF2 d, const mat_GF2& A, vec_GF2& x, const vec_GF2& b)
414 {
415 solve_impl(d, x, A, b, false);
416 }
400417
401418
402419 void inv(ref_GF2 d, mat_GF2& X, const mat_GF2& A)
281281 }
282282
283283
284 void solve(GF2E& d, vec_GF2E& X,
285 const mat_GF2E& A, const vec_GF2E& b)
284 static
285 void solve_impl(GF2E& d, vec_GF2E& X, const mat_GF2E& A, const vec_GF2E& b, bool trans)
286286
287287 {
288288 long n = A.NumRows();
310310
311311 for (i = 0; i < n; i++) {
312312 M[i].SetSize(n+1, 2*GF2E::WordLength());
313 for (j = 0; j < n; j++)
314 M[i][j] = rep(A[j][i]);
313
314 if (trans)
315 for (j = 0; j < n; j++) M[i][j] = rep(A[j][i]);
316 else
317 for (j = 0; j < n; j++) M[i][j] = rep(A[i][j]);
318
315319 M[i][n] = rep(b[i]);
316320 }
317321
377381 }
378382
379383 conv(d, det);
384 }
385
386 void solve(GF2E& d, vec_GF2E& x, const mat_GF2E& A, const vec_GF2E& b)
387 {
388 solve_impl(d, x, A, b, true);
389 }
390
391 void solve(GF2E& d, const mat_GF2E& A, vec_GF2E& x, const vec_GF2E& b)
392 {
393 solve_impl(d, x, A, b, false);
380394 }
381395
382396 void inv(GF2E& d, mat_GF2E& X, const mat_GF2E& A)
313313 }
314314
315315
316 void solve(ZZ_p& d, vec_ZZ_p& X,
317 const mat_ZZ_p& A, const vec_ZZ_p& b)
316 static
317 void solve_impl(ZZ_p& d, vec_ZZ_p& X, const mat_ZZ_p& A, const vec_ZZ_p& b, bool trans)
318318
319319 {
320320 long n = A.NumRows();
344344
345345 for (i = 0; i < n; i++) {
346346 M[i].SetSize(n+1, t1.size());
347 for (j = 0; j < n; j++)
348 M[i][j] = rep(A[j][i]);
347
348 if (trans)
349 for (j = 0; j < n; j++) M[i][j] = rep(A[j][i]);
350 else
351 for (j = 0; j < n; j++) M[i][j] = rep(A[i][j]);
352
349353 M[i][n] = rep(b[i]);
350354 }
351355
413417 }
414418
415419 conv(d, det);
420 }
421
422 void solve(ZZ_p& d, vec_ZZ_p& x, const mat_ZZ_p& A, const vec_ZZ_p& b)
423 {
424 solve_impl(d, x, A, b, true);
425 }
426
427 void solve(ZZ_p& d, const mat_ZZ_p& A, vec_ZZ_p& x, const vec_ZZ_p& b)
428 {
429 solve_impl(d, x, A, b, false);
416430 }
417431
418432 void inv(ZZ_p& d, mat_ZZ_p& X, const mat_ZZ_p& A)
315315 }
316316
317317
318 void solve(ZZ_pE& d, vec_ZZ_pE& X,
319 const mat_ZZ_pE& A, const vec_ZZ_pE& b)
318 static
319 void solve_impl(ZZ_pE& d, vec_ZZ_pE& X, const mat_ZZ_pE& A, const vec_ZZ_pE& b, bool trans)
320320
321321 {
322322 long n = A.NumRows();
344344
345345 for (i = 0; i < n; i++) {
346346 M[i].SetLength(n+1);
347 for (j = 0; j < n; j++) {
348 M[i][j].rep.SetMaxLength(2*deg(p)-1);
349 M[i][j] = rep(A[j][i]);
347 if (trans) {
348 for (j = 0; j < n; j++) {
349 M[i][j].rep.SetMaxLength(2*deg(p)-1);
350 M[i][j] = rep(A[j][i]);
351 }
352 }
353 else {
354 for (j = 0; j < n; j++) {
355 M[i][j].rep.SetMaxLength(2*deg(p)-1);
356 M[i][j] = rep(A[i][j]);
357 }
350358 }
351359 M[i][n].rep.SetMaxLength(2*deg(p)-1);
352360 M[i][n] = rep(b[i]);
416424 }
417425
418426 conv(d, det);
427 }
428
429 void solve(ZZ_pE& d, vec_ZZ_pE& x, const mat_ZZ_pE& A, const vec_ZZ_pE& b)
430 {
431 solve_impl(d, x, A, b, true);
432 }
433
434 void solve(ZZ_pE& d, const mat_ZZ_pE& A, vec_ZZ_pE& x, const vec_ZZ_pE& b)
435 {
436 solve_impl(d, x, A, b, false);
419437 }
420438
421439 void inv(ZZ_pE& d, mat_ZZ_pE& X, const mat_ZZ_pE& A)
33 #include <NTL/vec_long.h>
44
55
6 #include <NTL/BasicThreadPool.h>
7
8
9
10 #ifdef NTL_HAVE_AVX
11 #include <immintrin.h>
12 #endif
613
714 NTL_START_IMPL
15
16
17 #define PAR_THRESH_SQ (200)
18 #define PAR_THRESH (40000)
19
20
21 // *******************************************************
22 //
23 // Matrix Window data structure: perhaps some day this
24 // will be made public.
25 //
26 // *******************************************************
27
28 struct mat_window_zz_p {
29 mat_zz_p &A;
30 long r_offset;
31 long c_offset;
32 long nrows;
33 long ncols;
34
35 mat_window_zz_p(mat_zz_p& _A) :
36 A(_A), r_offset(0), c_offset(0), nrows(A.NumRows()), ncols(A.NumCols()) { }
37
38 mat_window_zz_p(const mat_window_zz_p& w, long r1, long c1, long r2, long c2) :
39 A(w.A)
40 {
41 if (r1 < 0 || c1 < 0 || r2 < r1 || c2 < c1 || r2-r1 > w.nrows || c2-c1 > w.ncols)
42 LogicError("mat_window_zz_p: bad args");
43
44 r_offset = w.r_offset + r1;
45 c_offset = w.c_offset + c1;
46 nrows = r2-r1;
47 ncols = c2-c1;
48 }
49
50 zz_p * operator[](long i) const { return A[i+r_offset].elts() + c_offset; }
51
52 long NumRows() const { return nrows; }
53 long NumCols() const { return ncols; }
54
55 };
56
57
58 struct const_mat_window_zz_p {
59 const mat_zz_p &A;
60 long r_offset;
61 long c_offset;
62 long nrows;
63 long ncols;
64
65 const_mat_window_zz_p(const mat_zz_p& _A) :
66 A(_A), r_offset(0), c_offset(0), nrows(A.NumRows()), ncols(A.NumCols()) { }
67
68 const_mat_window_zz_p(const mat_window_zz_p& w) :
69 A(w.A), r_offset(w.r_offset), c_offset(w.c_offset), nrows(w.nrows), ncols(w.ncols) { }
70
71 const_mat_window_zz_p(const const_mat_window_zz_p& w, long r1, long c1, long r2, long c2) :
72 A(w.A)
73 {
74 if (r1 < 0 || c1 < 0 || r2 < r1 || c2 < c1 || r2-r1 > w.nrows || c2-c1 > w.ncols)
75 LogicError("const_mat_window_zz_p: bad args");
76
77 r_offset = w.r_offset + r1;
78 c_offset = w.c_offset + c1;
79 nrows = r2-r1;
80 ncols = c2-c1;
81 }
82
83 const zz_p * operator[](long i) const { return A[i+r_offset].elts() + c_offset; }
84
85 long NumRows() const { return nrows; }
86 long NumCols() const { return ncols; }
87
88 };
89
90 void add(const mat_window_zz_p& X,
91 const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
92 {
93 long n = A.NumRows();
94 long m = A.NumCols();
95
96 if (B.NumRows() != n || B.NumCols() != m)
97 LogicError("matrix add: dimension mismatch");
98
99 if (X.NumRows() != n || X.NumCols() != m)
100 LogicError("matrix add: dimension mismatch");
101
102 long p = zz_p::modulus();
103
104 for (long i = 0; i < n; i++) {
105 zz_p *x = X[i];
106 const zz_p *a = A[i];
107 const zz_p *b = B[i];
108 for (long j = 0; j < m; j++) {
109 x[j].LoopHole() = AddMod(rep(a[j]), rep(b[j]), p);
110 }
111 }
112 }
113
114 void sub(const mat_window_zz_p& X,
115 const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
116 {
117 long n = A.NumRows();
118 long m = A.NumCols();
119
120 if (B.NumRows() != n || B.NumCols() != m)
121 LogicError("matrix sub: dimension mismatch");
122
123 if (X.NumRows() != n || X.NumCols() != m)
124 LogicError("matrix sub: dimension mismatch");
125
126 long p = zz_p::modulus();
127
128 for (long i = 0; i < n; i++) {
129 zz_p *x = X[i];
130 const zz_p *a = A[i];
131 const zz_p *b = B[i];
132 for (long j = 0; j < m; j++) {
133 x[j].LoopHole() = SubMod(rep(a[j]), rep(b[j]), p);
134 }
135 }
136 }
137
138
139 void clear(const mat_window_zz_p& X)
140 {
141 long n = X.NumRows();
142 long m = X.NumCols();
143
144 for (long i = 0; i < n; i++)
145 for (long j = 0; j < m; j++)
146 clear(X[i][j]);
147 }
148
149
150
151 // ***********************************************************
152
153
154
155
8156
9157
10158 void add(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
16164 LogicError("matrix add: dimension mismatch");
17165
18166 X.SetDims(n, m);
167
168 long p = zz_p::modulus();
19169
20 long i, j;
21 for (i = 1; i <= n; i++)
22 for (j = 1; j <= m; j++)
23 add(X(i,j), A(i,j), B(i,j));
170 for (long i = 0; i < n; i++) {
171 zz_p *x = X[i].elts();
172 const zz_p *a = A[i].elts();
173 const zz_p *b = B[i].elts();
174 for (long j = 0; j < m; j++) {
175 x[j].LoopHole() = AddMod(rep(a[j]), rep(b[j]), p);
176 }
177 }
24178 }
25179
26180 void sub(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
32186 LogicError("matrix sub: dimension mismatch");
33187
34188 X.SetDims(n, m);
189
190 long p = zz_p::modulus();
35191
192 for (long i = 0; i < n; i++) {
193 zz_p *x = X[i].elts();
194 const zz_p *a = A[i].elts();
195 const zz_p *b = B[i].elts();
196 for (long j = 0; j < m; j++) {
197 x[j].LoopHole() = SubMod(rep(a[j]), rep(b[j]), p);
198 }
199 }
200
201 }
202
203
204
205
206
207 void diag(mat_zz_p& X, long n, zz_p d)
208 {
209 X.SetDims(n, n);
36210 long i, j;
211
37212 for (i = 1; i <= n; i++)
38 for (j = 1; j <= m; j++)
39 sub(X(i,j), A(i,j), B(i,j));
40 }
41
42
43 // some local buffers
44
45 NTL_THREAD_LOCAL static vec_long mul_aux_vec;
46 NTL_THREAD_LOCAL static Vec<mulmod_precon_t> precon_vec;
47
48
49
// Computes X = A*B over zz_p (current modulus p).
// Requires A.NumCols() == B.NumRows(); otherwise LogicError is called.
// X is resized *before* A and B are read, so X must not be the same
// object as A or B (the public mul() wrapper guards against that).
50 static
51 void mul_aux(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
52 {
53 long n = A.NumRows();
54 long l = A.NumCols();
55 long m = B.NumCols();
56
57 if (l != B.NumRows())
58 LogicError("matrix mul: dimension mismatch");
59
60 X.SetDims(n, m);
61
62 if (m > 1) { // new preconditioning code
63
64 long p = zz_p::modulus();
65 mulmod_t pinv = zz_p::ModulusInverse();
66
67
// acc is a length-m scratch row held in the file-level buffer mul_aux_vec.
// NOTE(review): the Watcher presumably releases/trims the buffer on scope
// exit -- confirm against the Vec documentation.
68 vec_long::Watcher watch_mul_aux_vec(mul_aux_vec);
69 mul_aux_vec.SetLength(m);
70 long *acc = mul_aux_vec.elts();
71
72 long i, j, k;
73
// One output row at a time: acc = sum over k of A[i][k] * (row k of B).
74 for (i = 0; i < n; i++) {
75 const zz_p* ap = A[i].elts();
76
77 for (j = 0; j < m; j++) acc[j] = 0;
78
79 for (k = 0; k < l; k++) {
80 long aa = rep(ap[k]);
// zero coefficients contribute nothing, so skip the whole row of B
81 if (aa != 0) {
82 const zz_p* bp = B[k].elts();
83 long T1;
// precompute once per A[i][k]; reused for all m products in this row
84 mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);
85
86 for (j = 0; j < m; j++) {
87 T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
88 acc[j] = AddMod(acc[j], T1, p);
89 }
90 }
91 }
92
// copy the accumulated residues into row i of X
93 zz_p *xp = X[i].elts();
94 for (j = 0; j < m; j++)
95 xp[j].LoopHole() = acc[j];
96 }
97 }
98 else { // just use the old code, w/o preconditioning
99
100 long p = zz_p::modulus();
101 mulmod_t pinv = zz_p::ModulusInverse();
102
103 long i, j, k;
104 long acc, tmp;
105
// m <= 1: plain dot products using the 1-based (i,j) accessors
106 for (i = 1; i <= n; i++) {
107 for (j = 1; j <= m; j++) {
108 acc = 0;
109 for(k = 1; k <= l; k++) {
110 tmp = MulMod(rep(A(i,k)), rep(B(k,j)), p, pinv);
111 acc = AddMod(acc, tmp, p);
112 }
113 X(i,j).LoopHole() = acc;
114 }
115 }
116
117 }
118 }
119
120 void mul(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
121 {
122 if (&X == &A || &X == &B) {
123 mat_zz_p tmp;
124 mul_aux(tmp, A, B);
125 X = tmp;
126 }
127 else
128 mul_aux(X, A, B);
129 }
130
131
// Computes the row-vector/matrix product x = a*B over zz_p.
// Requires a.length() == B.NumRows(); otherwise LogicError is called.
// In every branch x is resized only after the last read of a, so the
// result is accumulated separately from the input vector.
132 void mul(vec_zz_p& x, const vec_zz_p& a, const mat_zz_p& B)
133 {
134 long l = a.length();
135 long m = B.NumCols();
136
137 if (l != B.NumRows())
138 LogicError("matrix mul: dimension mismatch");
139
140 if (m == 0) {
141
// B has no columns: the result is the empty vector
142 x.SetLength(0);
143
144 }
145 else if (m == 1) {
146
147 long p = zz_p::modulus();
148 mulmod_t pinv = zz_p::ModulusInverse();
149
150 long acc, tmp;
151 long k;
152
// single column: one dot product, using the 1-based accessors
153 acc = 0;
154 for(k = 1; k <= l; k++) {
155 tmp = MulMod(rep(a(k)), rep(B(k,1)), p, pinv);
156 acc = AddMod(acc, tmp, p);
157 }
158
159 x.SetLength(1);
160 x(1).LoopHole() = acc;
161
162 }
163 else { // m > 1. precondition
164
165
166 long p = zz_p::modulus();
167 mulmod_t pinv = zz_p::ModulusInverse();
168
// acc is a length-m scratch row held in the file-level buffer mul_aux_vec
169 vec_long::Watcher watch_mul_aux_vec(mul_aux_vec);
170 mul_aux_vec.SetLength(m);
171 long *acc = mul_aux_vec.elts();
172
173 long j, k;
174
175
176 const zz_p* ap = a.elts();
177
178 for (j = 0; j < m; j++) acc[j] = 0;
179
// acc = sum over k of a[k] * (row k of B); zero coefficients are skipped
180 for (k = 0; k < l; k++) {
181 long aa = rep(ap[k]);
182 if (aa != 0) {
183 const zz_p* bp = B[k].elts();
184 long T1;
// precompute once per a[k]; reused for all m products in the row
185 mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);
186
187 for (j = 0; j < m; j++) {
188 T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
189 acc[j] = AddMod(acc[j], T1, p);
190 }
191 }
192 }
193
// resize the output only now, then copy the accumulated residues
194 x.SetLength(m);
195 zz_p *xp = x.elts();
196 for (j = 0; j < m; j++)
197 xp[j].LoopHole() = acc[j];
198 }
199 }
200
201
// Computes the matrix/column-vector product x = A*b over zz_p.
// Requires A.NumCols() == b.length(); otherwise LogicError is called.
// x is resized before A and b are read, so x must not alias them (the
// public mul() wrapper handles that case with a temporary).
202 void mul_aux(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
203 {
204 long n = A.NumRows();
205 long l = A.NumCols();
206
207 if (l != b.length())
208 LogicError("matrix mul: dimension mismatch");
209
210 x.SetLength(n);
211 zz_p* xp = x.elts();
212
213 long p = zz_p::modulus();
214 mulmod_t pinv = zz_p::ModulusInverse();
215
216 long i, k;
217 long acc, tmp;
218
219 const zz_p* bp = b.elts();
220
221 if (n <= 1) {
222
// at most one row: plain MulMod, no per-entry precomputation
223 for (i = 0; i < n; i++) {
224 acc = 0;
225 const zz_p* ap = A[i].elts();
226
227 for (k = 0; k < l; k++) {
228 tmp = MulMod(rep(ap[k]), rep(bp[k]), p, pinv);
229 acc = AddMod(acc, tmp, p);
230 }
231
232 xp[i].LoopHole() = acc;
233 }
234
235 }
236 else {
237
// several rows: precompute one mulmod_precon_t per entry of b (stored in
// the file-level buffer precon_vec) and reuse it for every row of A
238 Vec<mulmod_precon_t>::Watcher watch_precon_vec(precon_vec);
239 precon_vec.SetLength(l);
240 mulmod_precon_t *bpinv = precon_vec.elts();
241
242 for (k = 0; k < l; k++)
243 bpinv[k] = PrepMulModPrecon(rep(bp[k]), p, pinv);
244
// x[i] = dot product of row i of A with b
245 for (i = 0; i < n; i++) {
246 acc = 0;
247 const zz_p* ap = A[i].elts();
248
249 for (k = 0; k < l; k++) {
250 tmp = MulModPrecon(rep(ap[k]), rep(bp[k]), p, bpinv[k]);
251 acc = AddMod(acc, tmp, p);
252 }
253
254 xp[i].LoopHole() = acc;
255 }
256 }
257 }
258
259 void mul(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
260 {
261 if (&b == &x || A.position1(x) != -1) {
262 vec_zz_p tmp;
263 mul_aux(tmp, A, b);
264 x = tmp;
265 }
266 else
267 mul_aux(x, A, b);
268
269 }
270
271
272 void mul(mat_zz_p& X, const mat_zz_p& A, zz_p b)
213 for (j = 1; j <= n; j++)
214 if (i == j)
215 X(i, j) = d;
216 else
217 clear(X(i, j));
218 }
219
220 long IsDiag(const mat_zz_p& A, long n, zz_p d)
221 {
222 if (A.NumRows() != n || A.NumCols() != n)
223 return 0;
224
225 long i, j;
226
227 for (i = 1; i <= n; i++)
228 for (j = 1; j <= n; j++)
229 if (i != j) {
230 if (!IsZero(A(i, j))) return 0;
231 }
232 else {
233 if (A(i, j) != d) return 0;
234 }
235
236 return 1;
237 }
238
239 void negate(mat_zz_p& X, const mat_zz_p& A)
273240 {
274241 long n = A.NumRows();
275242 long m = A.NumCols();
276243
244
277245 X.SetDims(n, m);
278246
279 long i, j;
280
281 if (n == 0 || m == 0 || (n == 1 && m == 1)) {
282
283 for (i = 0; i < n; i++)
284 for (j = 0; j < m; j++)
285 mul(X[i][j], A[i][j], b);
286
287 }
288 else {
289
290 long p = zz_p::modulus();
291 mulmod_t pinv = zz_p::ModulusInverse();
292 long bb = rep(b);
293 mulmod_precon_t bpinv = PrepMulModPrecon(bb, p, pinv);
294
295 for (i = 0; i < n; i++) {
296 const zz_p *ap = A[i].elts();
297 zz_p *xp = X[i].elts();
298
299 for (j = 0; j < m; j++)
300 xp[j].LoopHole() = MulModPrecon(rep(ap[j]), bb, p, bpinv);
301 }
302
303 }
304 }
305
306 void mul(mat_zz_p& X, const mat_zz_p& A, long b_in)
307 {
308 zz_p b;
309 b = b_in;
310 mul(X, A, b);
311 }
312
313
314
315
316
247 long p = zz_p::modulus();
248
249 for (long i = 0; i < n; i++) {
250 zz_p *x = X[i].elts();
251 const zz_p *a = A[i].elts();
252 for (long j = 0; j < m; j++) {
253 x[j].LoopHole() = NegateMod(rep(a[j]), p);
254 }
255 }
256 }
257
258 long IsZero(const mat_zz_p& a)
259 {
260 long n = a.NumRows();
261 long i;
262
263 for (i = 0; i < n; i++)
264 if (!IsZero(a[i]))
265 return 0;
266
267 return 1;
268 }
269
270 void clear(mat_zz_p& x)
271 {
272 long n = x.NumRows();
273 long i;
274 for (i = 0; i < n; i++)
275 clear(x[i]);
276 }
277
317278
318279 void ident(mat_zz_p& X, long n)
319280 {
329290 }
330291
331292
332
// Computes d = det(M_in) over zz_p by Gaussian elimination on a private
// copy of the matrix.  Calls LogicError if M_in is not square.  For a 0x0
// matrix d is assigned via set(d) (presumably the value 1 -- confirm).
// If some column has no usable pivot, d is cleared and the function
// returns early.
333 void determinant(zz_p& d, const mat_zz_p& M_in)
334 {
335 long k, n;
336 long i, j;
337 long pos;
338 zz_p t1, t2, t3;
339 zz_p *x, *y;
340
// work on a copy so the caller's matrix is left untouched
341 mat_zz_p M;
342 M = M_in;
343
344 n = M.NumRows();
345
346 if (M.NumCols() != n)
347 LogicError("determinant: nonsquare matrix");
348
349 if (n == 0) {
350 set(d);
351 return;
352 }
353
// running product of the pivots, with a sign flip per row swap
354 zz_p det;
355
356 set(det);
357
358 long p = zz_p::modulus();
359 mulmod_t pinv = zz_p::ModulusInverse();
360
361 for (k = 0; k < n; k++) {
// pivot search: first row at or below k with a nonzero entry in column k
362 pos = -1;
363 for (i = k; i < n; i++) {
364 if (!IsZero(M[i][k])) {
365 pos = i;
366 break;
367 }
368 }
369
370 if (pos != -1) {
371 if (k != pos) {
372 swap(M[pos], M[k]);
373 negate(det, det);
374 }
375
376 mul(det, det, M[k][k]);
377
// t3 = inverse of the pivot
378 inv(t3, M[k][k]);
379
380 for (i = k+1; i < n; i++) {
381 // M[i] = M[i] - M[k]*M[i,k]*t3
382
383 mul(t1, M[i][k], t3);
384 negate(t1, t1);
385
// only columns to the right of k are updated; column k itself is not
// read again by later iterations
386 x = M[i].elts() + (k+1);
387 y = M[k].elts() + (k+1);
388
389 long T1 = rep(t1);
390 mulmod_precon_t t1pinv = PrepMulModPrecon(T1, p, pinv); // T1*pinv;
391 long T2;
392
393 for (j = k+1; j < n; j++, x++, y++) {
394 // *x = *x + (*y)*t1
395
396 T2 = MulModPrecon(rep(*y), T1, p, t1pinv);
397 x->LoopHole() = AddMod(rep(*x), T2, p);
398 }
399 }
400 }
401 else {
// no pivot in column k: the matrix is singular
402 clear(d);
403 return;
404 }
405 }
406
407 d = det;
408 }
409
410
411
412
413293 long IsIdent(const mat_zz_p& A, long n)
414294 {
415295 if (A.NumRows() != n || A.NumCols() != n)
461341 }
462342
463343
// Solves a square linear system over zz_p by Gaussian elimination and
// sets d to the determinant computed along the way.
// The working matrix is loaded with M[i][j] = A[j][i] (i.e. the transpose
// of A) and b as an extra column, so X is the solution of that transposed
// system.  Calls LogicError if A is not square or b has the wrong length.
// If no pivot is found in some column, d is cleared and the function
// returns without assigning X.
464 void solve(zz_p& d, vec_zz_p& X,
465 const mat_zz_p& A, const vec_zz_p& b)
466
467 {
468 long n = A.NumRows();
469
470 if (A.NumCols() != n)
471 LogicError("solve: nonsquare matrix");
472
473
474 if (b.length() != n)
475 LogicError("solve: dimension mismatch");
476
477 if (n == 0) {
478 set(d);
479 X.SetLength(0);
480 return;
481 }
482
483 long i, j, k, pos;
484 zz_p t1, t2, t3;
485 zz_p *x, *y;
486
// augmented matrix [ transpose(A) | b ], n rows by n+1 columns
487 mat_zz_p M;
488 M.SetDims(n, n+1);
489 for (i = 0; i < n; i++) {
490 for (j = 0; j < n; j++)
491 M[i][j] = A[j][i];
492 M[i][n] = b[i];
493 }
494
// running product of the pivots, with a sign flip per row swap
495 zz_p det;
496 set(det);
497
498 long p = zz_p::modulus();
499 mulmod_t pinv = zz_p::ModulusInverse();
500
501 for (k = 0; k < n; k++) {
// pivot search: first row at or below k with a nonzero entry in column k
502 pos = -1;
503 for (i = k; i < n; i++) {
504 if (!IsZero(M[i][k])) {
505 pos = i;
506 break;
507 }
508 }
509
510 if (pos != -1) {
511 if (k != pos) {
512 swap(M[pos], M[k]);
513 negate(det, det);
514 }
515
516 mul(det, det, M[k][k]);
517
// the diagonal slot is overwritten with the pivot's inverse; back
// substitution below multiplies by M[i][i] instead of dividing
518 inv(t3, M[k][k]);
519 M[k][k] = t3;
520
521
522 for (i = k+1; i < n; i++) {
523 // M[i] = M[i] - M[k]*M[i,k]*t3
524
525 mul(t1, M[i][k], t3);
526 negate(t1, t1);
527
528 x = M[i].elts() + (k+1);
529 y = M[k].elts() + (k+1);
530
531 long T1 = rep(t1);
532 mulmod_precon_t t1pinv = PrepMulModPrecon(T1, p, pinv); // T1*pinv;
533 long T2;
534
// j runs through column n inclusive so the right-hand side is updated too
535 for (j = k+1; j <= n; j++, x++, y++) {
536 // *x = *x + (*y)*t1
537
538 T2 = MulModPrecon(rep(*y), T1, p, t1pinv);
539 x->LoopHole() = AddMod(rep(*x), T2, p);
540 }
541 }
542 }
543 else {
// no pivot in column k: singular system, X is left as it was
544 clear(d);
545 return;
546 }
547 }
548
// back substitution, last unknown first:
// X[i] = (M[i][n] - sum_{j>i} X[j]*M[i][j]) * (stored pivot inverse)
549 X.SetLength(n);
550 for (i = n-1; i >= 0; i--) {
551 clear(t1);
552 for (j = i+1; j < n; j++) {
553 mul(t2, X[j], M[i][j]);
554 add(t1, t1, t2);
555 }
556 sub(t1, M[i][n], t1);
557 mul(X[i], t1, M[i][i]);
558 }
559
560 d = det;
561 }
562
// Computes X = inverse of A over zz_p and d = det(A) by Gaussian
// elimination on the augmented matrix [ A | I ].
// Calls LogicError if A is not square.  If no pivot is found in some
// column, d is cleared and the function returns without assigning X.
563 void inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A)
564 {
565 long n = A.NumRows();
566 if (A.NumCols() != n)
567 LogicError("inv: nonsquare matrix");
568
569 if (n == 0) {
570 set(d);
571 X.SetDims(0, 0);
572 return;
573 }
574
575 long i, j, k, pos;
576 zz_p t1, t2, t3;
577 zz_p *x, *y;
578
// left half of M is a copy of A; right half gets set() on the diagonal
// and clear() elsewhere
579 mat_zz_p M;
580 M.SetDims(n, 2*n);
581 for (i = 0; i < n; i++) {
582 for (j = 0; j < n; j++) {
583 M[i][j] = A[i][j];
584 clear(M[i][n+j]);
585 }
586 set(M[i][n+i]);
587 }
588
// running product of the pivots, with a sign flip per row swap
589 zz_p det;
590 set(det);
591
592 long p = zz_p::modulus();
593 mulmod_t pinv = zz_p::ModulusInverse();
594
595 for (k = 0; k < n; k++) {
// pivot search: first row at or below k with a nonzero entry in column k
596 pos = -1;
597 for (i = k; i < n; i++) {
598 if (!IsZero(M[i][k])) {
599 pos = i;
600 break;
601 }
602 }
603
604 if (pos != -1) {
605 if (k != pos) {
606 swap(M[pos], M[k]);
607 negate(det, det);
608 }
609
610 mul(det, det, M[k][k]);
611
// the diagonal slot is overwritten with the pivot's inverse; back
// substitution below multiplies by M[i][i] instead of dividing
612 inv(t3, M[k][k]);
613 M[k][k] = t3;
614
615 for (i = k+1; i < n; i++) {
616 // M[i] = M[i] - M[k]*M[i,k]*t3
617
618 mul(t1, M[i][k], t3);
619 negate(t1, t1);
620
621 x = M[i].elts() + (k+1);
622 y = M[k].elts() + (k+1);
623
624 long T1 = rep(t1);
625 mulmod_precon_t t1pinv = PrepMulModPrecon(T1, p, pinv); // T1*pinv;
626 long T2;
627
// j runs over the rest of the left half and all of the right half
628 for (j = k+1; j < 2*n; j++, x++, y++) {
629 // *x = *x + (*y)*t1
630
631 T2 = MulModPrecon(rep(*y), T1, p, t1pinv);
632 x->LoopHole() = AddMod(rep(*x), T2, p);
633 }
634 }
635 }
636 else {
// no pivot in column k: A is singular, X is left as it was
637 clear(d);
638 return;
639 }
640 }
641
// back substitution, one column k of the result at a time:
// X[i][k] = (M[i][n+k] - sum_{j>i} X[j][k]*M[i][j]) * (stored pivot inverse)
642 X.SetDims(n, n);
643 for (k = 0; k < n; k++) {
644 for (i = n-1; i >= 0; i--) {
645 clear(t1);
646 for (j = i+1; j < n; j++) {
647 mul(t2, X[j][k], M[i][j]);
648 add(t1, t1, t2);
649 }
650 sub(t1, M[i][n+k], t1);
651 mul(X[i][k], t1, M[i][i]);
652 }
653 }
654
655 d = det;
656 }
657
// In-place forward elimination on M, considering only the first w columns
// as pivot candidates.  Returns the number of pivot rows found.
// Calls LogicError unless 0 <= w <= M.NumCols().
658 long gauss(mat_zz_p& M, long w)
659 {
660 long k, l;
661 long i, j;
662 long pos;
663 zz_p t1, t2, t3;
664 zz_p *x, *y;
665
666 long n = M.NumRows();
667 long m = M.NumCols();
668
669 if (w < 0 || w > m)
670 LogicError("gauss: bad args");
671
672 long p = zz_p::modulus();
673 mulmod_t pinv = zz_p::ModulusInverse();
674 long T1, T2;
675
// l = index of the next pivot row; k = column currently being examined
676 l = 0;
677 for (k = 0; k < w && l < n; k++) {
678
// pivot search: first row at or below l with a nonzero entry in column k
679 pos = -1;
680 for (i = l; i < n; i++) {
681 if (!IsZero(M[i][k])) {
682 pos = i;
683 break;
684 }
685 }
686
// if the column has no pivot, move on to the next column with l unchanged
687 if (pos != -1) {
688 swap(M[pos], M[l]);
689
// t3 = -(1/pivot), so the row update below is an addition
690 inv(t3, M[l][k]);
691 negate(t3, t3);
692
693 for (i = l+1; i < n; i++) {
694 // M[i] = M[i] + M[l]*M[i,k]*t3
695
696 mul(t1, M[i][k], t3);
697
698 T1 = rep(t1);
699 mulmod_precon_t T1pinv = PrepMulModPrecon(T1, p, pinv);
700
// the entry below the pivot is zeroed explicitly rather than computed
701 clear(M[i][k]);
702
703 x = M[i].elts() + (k+1);
704 y = M[l].elts() + (k+1);
705
// all remaining columns (up to m, not just w) are updated
706 for (j = k+1; j < m; j++, x++, y++) {
707 // *x = *x + (*y)*t1
708
709 T2 = MulModPrecon(rep(*y), T1, p, T1pinv);
710 T2 = AddMod(T2, rep(*x), p);
711 (*x).LoopHole() = T2;
712 }
713 }
714
715 l++;
716 }
717 }
718
719 return l;
720 }
721
722 long gauss(mat_zz_p& M)
723 {
724 return gauss(M, M.NumCols());
725 }
726
727 void image(mat_zz_p& X, const mat_zz_p& A)
728 {
729 mat_zz_p M;
730 M = A;
731 long r = gauss(M);
732 M.SetDims(r, M.NumCols());
733 X = M;
734 }
735
// Builds in X a matrix with m - r rows and m columns, where m = A.NumRows()
// and r is the value gauss() returns for the transpose of A.  Each row of X
// is constructed by back substitution against the reduced transpose
// (presumably a basis of the left kernel {v : v*A = 0} -- confirm against
// the module documentation).
736 void kernel(mat_zz_p& X, const mat_zz_p& A)
737 {
738 long m = A.NumRows();
739 long n = A.NumCols();
740
741 mat_zz_p M;
742 long r;
743
744 transpose(M, A);
745 r = gauss(M);
746
747 X.SetDims(m-r, m);
748
749 long i, j, k, s;
750 zz_p t1, t2;
751
// D[j] = pivot row whose leading nonzero entry sits in column j,
// or -1 if column j holds no pivot (a free column)
752 vec_long D;
753 D.SetLength(m);
754 for (j = 0; j < m; j++) D[j] = -1;
755
// inverses[j] = inverse of the pivot entry in column j (pivot columns only)
756 vec_zz_p inverses;
757 inverses.SetLength(m);
758
// walk the r pivot rows; j advances to each row's first nonzero column
759 j = -1;
760 for (i = 0; i < r; i++) {
761 do {
762 j++;
763 } while (IsZero(M[i][j]));
764
765 D[j] = i;
766 inv(inverses[j], M[i][j]);
767 }
768
// one output row per free column; columns are filled right to left so
// that v[s] for s > j is already known when v[j] is computed
769 for (k = 0; k < m-r; k++) {
770 vec_zz_p& v = X[k];
771 long pos = 0;
772 for (j = m-1; j >= 0; j--) {
773 if (D[j] == -1) {
// free column: set() for the k-th free column counted from the right,
// clear() for every other free column
774 if (pos == k)
775 set(v[j]);
776 else
777 clear(v[j]);
778 pos++;
779 }
780 else {
781 i = D[j];
782
783 clear(t1);
784
// pivot column: v[j] = -(sum_{s>j} v[s]*M[i][s]) * inverses[j]
785 for (s = j+1; s < m; s++) {
786 mul(t2, v[s], M[i][s]);
787 add(t1, t1, t2);
788 }
789
790 mul(t1, t1, inverses[j]);
791 negate(v[j], t1);
792 }
793 }
794 }
795 }
796
797
798
799
800
801 void diag(mat_zz_p& X, long n, zz_p d)
802 {
803 X.SetDims(n, n);
804 long i, j;
805
806 for (i = 1; i <= n; i++)
807 for (j = 1; j <= n; j++)
808 if (i == j)
809 X(i, j) = d;
810 else
811 clear(X(i, j));
812 }
813
814 long IsDiag(const mat_zz_p& A, long n, zz_p d)
815 {
816 if (A.NumRows() != n || A.NumCols() != n)
817 return 0;
818
819 long i, j;
820
821 for (i = 1; i <= n; i++)
822 for (j = 1; j <= n; j++)
823 if (i != j) {
824 if (!IsZero(A(i, j))) return 0;
825 }
826 else {
827 if (A(i, j) != d) return 0;
828 }
829
830 return 1;
831 }
832
833 void negate(mat_zz_p& X, const mat_zz_p& A)
834 {
835 long n = A.NumRows();
836 long m = A.NumCols();
837
838
839 X.SetDims(n, m);
840
841 long i, j;
842 for (i = 1; i <= n; i++)
843 for (j = 1; j <= m; j++)
844 negate(X(i,j), A(i,j));
845 }
846
847 long IsZero(const mat_zz_p& a)
848 {
849 long n = a.NumRows();
850 long i;
851
852 for (i = 0; i < n; i++)
853 if (!IsZero(a[i]))
854 return 0;
855
856 return 1;
857 }
858
859 void clear(mat_zz_p& x)
860 {
861 long n = x.NumRows();
862 long i;
863 for (i = 0; i < n; i++)
864 clear(x[i]);
865 }
866
867
868 mat_zz_p operator+(const mat_zz_p& a, const mat_zz_p& b)
869 {
870 mat_zz_p res;
871 add(res, a, b);
872 NTL_OPT_RETURN(mat_zz_p, res);
873 }
874
875 mat_zz_p operator*(const mat_zz_p& a, const mat_zz_p& b)
876 {
877 mat_zz_p res;
878 mul_aux(res, a, b);
879 NTL_OPT_RETURN(mat_zz_p, res);
880 }
881
882 mat_zz_p operator-(const mat_zz_p& a, const mat_zz_p& b)
883 {
884 mat_zz_p res;
885 sub(res, a, b);
886 NTL_OPT_RETURN(mat_zz_p, res);
887 }
888
889
890 mat_zz_p operator-(const mat_zz_p& a)
891 {
892 mat_zz_p res;
893 negate(res, a);
894 NTL_OPT_RETURN(mat_zz_p, res);
895 }
896
897
898 vec_zz_p operator*(const mat_zz_p& a, const vec_zz_p& b)
899 {
900 vec_zz_p res;
901 mul_aux(res, a, b);
902 NTL_OPT_RETURN(vec_zz_p, res);
903 }
904
905 vec_zz_p operator*(const vec_zz_p& a, const mat_zz_p& b)
906 {
907 vec_zz_p res;
908 mul(res, a, b);
909 NTL_OPT_RETURN(vec_zz_p, res);
910 }
911
912 void inv(mat_zz_p& X, const mat_zz_p& A)
913 {
914 zz_p d;
915 inv(d, X, A);
916 if (d == 0) ArithmeticError("inv: non-invertible matrix");
917 }
918
919 void power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e)
344
345
346 void relaxed_power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e, bool relax)
920347 {
921348 if (A.NumRows() != A.NumCols()) LogicError("power: non-square matrix");
922349
940367 }
941368
942369 if (e < 0)
943 inv(X, T1);
370 relaxed_inv(X, T1, relax);
944371 else
945372 X = T1;
946373 }
947374
375
376
377 // ******************************************************************
378 //
379 // matrix-vector multiplication code
380 //
381 // ******************************************************************
382
383
384
385
386
387
388 void mul(vec_zz_p& x, const vec_zz_p& a, const mat_zz_p& B)
389 {
390 long l = a.length();
391 long m = B.NumCols();
392
393 if (l != B.NumRows())
394 LogicError("matrix mul: dimension mismatch");
395
396 if (m == 0) {
397
398 x.SetLength(0);
399
400 }
401 else if (m == 1) {
402
403 long p = zz_p::modulus();
404 mulmod_t pinv = zz_p::ModulusInverse();
405
406 long acc, tmp;
407 long k;
408
409 acc = 0;
410 for(k = 1; k <= l; k++) {
411 tmp = MulMod(rep(a(k)), rep(B(k,1)), p, pinv);
412 acc = AddMod(acc, tmp, p);
413 }
414
415 x.SetLength(1);
416 x(1).LoopHole() = acc;
417
418 }
419 else { // m > 1. precondition and EXEC_RANGE
420
421
422 long p = zz_p::modulus();
423 mulmod_t pinv = zz_p::ModulusInverse();
424
425 NTL_TLS_LOCAL(vec_long, mul_aux_vec);
426 vec_long::Watcher watch_mul_aux_vec(mul_aux_vec);
427 mul_aux_vec.SetLength(m);
428 long *acc = mul_aux_vec.elts();
429
430 const zz_p* ap = a.elts();
431
432 for (long j = 0; j < m; j++) acc[j] = 0;
433
434 const bool seq = double(l)*double(m) < PAR_THRESH;
435
436 NTL_GEXEC_RANGE(seq, m, first, last) {
437
438 for (long k = 0; k < l; k++) {
439 long aa = rep(ap[k]);
440 if (aa != 0) {
441 const zz_p* bp = B[k].elts();
442 long T1;
443 mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);
444
445 for (long j = first; j < last; j++) {
446 T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
447 acc[j] = AddMod(acc[j], T1, p);
448 }
449 }
450 }
451
452 } NTL_GEXEC_RANGE_END
453
454 x.SetLength(m);
455 zz_p *xp = x.elts();
456 for (long j = 0; j < m; j++)
457 xp[j].LoopHole() = acc[j];
458 }
459 }
460
461
462 void mul_aux(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
463 {
464 long n = A.NumRows();
465 long l = A.NumCols();
466
467 if (l != b.length())
468 LogicError("matrix mul: dimension mismatch");
469
470 x.SetLength(n);
471 zz_p* xp = x.elts();
472
473 long p = zz_p::modulus();
474 const zz_p* bp = b.elts();
475
476 const bool seq = double(n)*double(l) < PAR_THRESH;
477
478
479 #ifdef NTL_HAVE_LL_TYPE
480
481 if (cast_unsigned(l) <= (~(0UL))/cast_unsigned(p-1) &&
482 cast_unsigned(l)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) {
483
484 sp_reduce_struct red_struct = zz_p::red_struct();
485
486 NTL_GEXEC_RANGE(seq, n, first, last) {
487
488 for (long i = first; i < last; i++) {
489 xp[i].LoopHole() = InnerProd_L(A[i].elts(), bp, l, p, red_struct);
490 }
491
492 } NTL_GEXEC_RANGE_END
493 }
494 else {
495 sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
496
497 NTL_GEXEC_RANGE(seq, n, first, last) {
498
499 for (long i = first; i < last; i++) {
500 xp[i].LoopHole() = InnerProd_LL(A[i].elts(), bp, l, p, ll_red_struct);
501 }
502
503 } NTL_GEXEC_RANGE_END
504
505 }
506
507 #else
508
509 mulmod_t pinv = zz_p::ModulusInverse();
510
511 if (n <= 1) {
512
513 for (long i = 0; i < n; i++) {
514 long acc = 0;
515 const zz_p* ap = A[i].elts();
516
517 for (long k = 0; k < l; k++) {
518 long tmp = MulMod(rep(ap[k]), rep(bp[k]), p, pinv);
519 acc = AddMod(acc, tmp, p);
520 }
521
522 xp[i].LoopHole() = acc;
523 }
524
525 }
526 else {
527
528 NTL_TLS_LOCAL(Vec<mulmod_precon_t>, precon_vec);
529 Vec<mulmod_precon_t>::Watcher watch_precon_vec(precon_vec);
530 precon_vec.SetLength(l);
531 mulmod_precon_t *bpinv = precon_vec.elts();
532
533 for (long k = 0; k < l; k++)
534 bpinv[k] = PrepMulModPrecon(rep(bp[k]), p, pinv);
535
536
537 NTL_GEXEC_RANGE(seq, n, first, last) {
538 for (long i = first; i < last; i++) {
539 long acc = 0;
540 const zz_p* ap = A[i].elts();
541
542 for (long k = 0; k < l; k++) {
543 long tmp = MulModPrecon(rep(ap[k]), rep(bp[k]), p, bpinv[k]);
544 acc = AddMod(acc, tmp, p);
545 }
546
547 xp[i].LoopHole() = acc;
548 }
549 } NTL_GEXEC_RANGE_END
550
551 }
552
553 #endif
554 }
555
556 void mul(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
557 {
558 if (&b == &x || A.position1(x) != -1) {
559 vec_zz_p tmp;
560 mul_aux(tmp, A, b);
561 x = tmp;
562 }
563 else
564 mul_aux(x, A, b);
565
566 }
567
568
569 void mul(mat_zz_p& X, const mat_zz_p& A, zz_p b)
570 {
571 long n = A.NumRows();
572 long m = A.NumCols();
573
574 X.SetDims(n, m);
575
576
577 if (n == 0 || m == 0 || (n == 1 && m == 1)) {
578 long i, j;
579
580 for (i = 0; i < n; i++)
581 for (j = 0; j < m; j++)
582 mul(X[i][j], A[i][j], b);
583
584 }
585 else {
586
587 long p = zz_p::modulus();
588 mulmod_t pinv = zz_p::ModulusInverse();
589 long bb = rep(b);
590 mulmod_precon_t bpinv = PrepMulModPrecon(bb, p, pinv);
591
592 const bool seq = double(n)*double(m) < PAR_THRESH;
593
594 NTL_GEXEC_RANGE(seq, n, first, last)
595 long i, j;
596 for (i = first; i < last; i++) {
597 const zz_p *ap = A[i].elts();
598 zz_p *xp = X[i].elts();
599
600 for (j = 0; j < m; j++)
601 xp[j].LoopHole() = MulModPrecon(rep(ap[j]), bb, p, bpinv);
602 }
603 NTL_GEXEC_RANGE_END
604
605
606 }
607 }
608
609 void mul(mat_zz_p& X, const mat_zz_p& A, long b_in)
610 {
611 zz_p b;
612 b = b_in;
613 mul(X, A, b);
614 }
615
616
617 // ******************************************************************
618 //
619 // Code shared by block-matrix code
620 //
621 // ******************************************************************
622
// Row stride, in elements, of the blocks processed by the *_by_32 kernels
// below.  Those kernels are unrolled for exactly 32 columns, so this value
// cannot be changed on its own.
623 #define MAT_BLK_SZ (32)
624
625
626 #ifdef NTL_HAVE_LL_TYPE
627
628 #ifdef NTL_HAVE_AVX
629
630 #define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1)
631 // max int representable exactly as a double
632 // this assumes NTL_DBL_PRECISION <= NTL_BITS_PER_LONG-2, which is
633 // checked in the code that tests for HAVE_AVX, but we check it here as
634 // well
635
636 #if (NTL_DBL_PRECISION > NTL_BITS_PER_LONG-2)
637 #error "NTL_DBL_PRECISION > NTL_BITS_PER_LONG-2"
638 #endif
639
640
641 // MUL_ADD(a, b, c): a += b*c
// All three arguments are __m256d values.  a is both read and assigned, so it
// must be a plain variable.  With NTL_HAVE_FMA the update is a single fused
// multiply-add (_mm256_fmadd_pd); otherwise it is a multiply followed by a
// separate add.
642 #ifdef NTL_HAVE_FMA
643 #define MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a)
644 #else
645 #define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c))
646 #endif
647
648 #if 0
// Straightforward variant of muladd1_by_32, compiled out by the enclosing
// #if 0.  Computes x[0..31] += sum over i < n of a[i] * b[32*i .. 32*i+31].
// x and b are accessed with aligned 256-bit loads/stores.
649 static
650 void muladd1_by_32(double *x, const double *a, const double *b, long n)
651 {
652 __m256d avec, bvec;
653
654
// load the 32 running sums into eight 4-wide accumulators
655 __m256d acc0=_mm256_load_pd(x + 0*4);
656 __m256d acc1=_mm256_load_pd(x + 1*4);
657 __m256d acc2=_mm256_load_pd(x + 2*4);
658 __m256d acc3=_mm256_load_pd(x + 3*4);
659 __m256d acc4=_mm256_load_pd(x + 4*4);
660 __m256d acc5=_mm256_load_pd(x + 5*4);
661 __m256d acc6=_mm256_load_pd(x + 6*4);
662 __m256d acc7=_mm256_load_pd(x + 7*4);
663
664
665 for (long i = 0; i < n; i++) {
// splat a[i] across all four lanes
666 avec = _mm256_broadcast_sd(a); a++;
667
668
// consume the next 32 doubles of b, four at a time
669 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
670 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
671 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec);
672 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec);
673 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec, bvec);
674 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec, bvec);
675 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec, bvec);
676 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec, bvec);
677 }
678
679
// write the accumulators back to x
680 _mm256_store_pd(x + 0*4, acc0);
681 _mm256_store_pd(x + 1*4, acc1);
682 _mm256_store_pd(x + 2*4, acc2);
683 _mm256_store_pd(x + 3*4, acc3);
684 _mm256_store_pd(x + 4*4, acc4);
685 _mm256_store_pd(x + 5*4, acc5);
686 _mm256_store_pd(x + 6*4, acc6);
687 _mm256_store_pd(x + 7*4, acc7);
688 }
689
690 #else
691
// Computes x[0..31] += sum over i < n of a[i] * b[32*i .. 32*i+31].
// The 32 running sums are held in eight 4-wide accumulators.  The main loop
// consumes four entries of a per iteration; a tail loop handles the remaining
// n mod 4 entries one at a time.  x and b are accessed with aligned 256-bit
// loads/stores.
692 static
693 void muladd1_by_32(double *x, const double *a, const double *b, long n)
694 {
695 __m256d acc0=_mm256_load_pd(x + 0*4);
696 __m256d acc1=_mm256_load_pd(x + 1*4);
697 __m256d acc2=_mm256_load_pd(x + 2*4);
698 __m256d acc3=_mm256_load_pd(x + 3*4);
699 __m256d acc4=_mm256_load_pd(x + 4*4);
700 __m256d acc5=_mm256_load_pd(x + 5*4);
701 __m256d acc6=_mm256_load_pd(x + 6*4);
702 __m256d acc7=_mm256_load_pd(x + 7*4);
703
704 long i = 0;
705 for (; i <= n-4; i +=4) {
706
707 // the following code sequences are a bit faster than
708 // just doing 4 _mm256_broadcast_sd's
709 // it requires a to point to aligned storage, however
710
711 #if 1
712 // this one seems slightly faster
// a0101 = {a[0],a[1],a[0],a[1]}, a2323 = {a[2],a[3],a[2],a[3]}
713 __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0));
714 __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2));
715 #else
716 __m256d avec = _mm256_load_pd(a);
717 __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0);
718 __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11);
719
720 #endif
721
// selector 0 picks the low element of each 128-bit half, 0xf the high one,
// so avec0..avec3 are a[0]..a[3] each replicated into all four lanes
722 __m256d avec0 = _mm256_permute_pd(a0101, 0);
723 __m256d avec1 = _mm256_permute_pd(a0101, 0xf);
724 __m256d avec2 = _mm256_permute_pd(a2323, 0);
725 __m256d avec3 = _mm256_permute_pd(a2323, 0xf);
726
727 a += 4;
728
729 __m256d bvec;
730
// row i of b times a[i]
731 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec0, bvec);
732 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec0, bvec);
733 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec0, bvec);
734 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec0, bvec);
735 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec0, bvec);
736 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec0, bvec);
737 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec0, bvec);
738 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec0, bvec);
739
// row i+1 of b times a[i+1]
740 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec1, bvec);
741 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec1, bvec);
742 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec1, bvec);
743 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec1, bvec);
744 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec1, bvec);
745 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec1, bvec);
746 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec1, bvec);
747 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec1, bvec);
748
// row i+2 of b times a[i+2]
749 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec2, bvec);
750 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec2, bvec);
751 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec2, bvec);
752 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec2, bvec);
753 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec2, bvec);
754 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec2, bvec);
755 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec2, bvec);
756 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec2, bvec);
757
// row i+3 of b times a[i+3]
758 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec3, bvec);
759 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec3, bvec);
760 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec3, bvec);
761 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec3, bvec);
762 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec3, bvec);
763 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec3, bvec);
764 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec3, bvec);
765 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec3, bvec);
766 }
767
// tail: remaining entries of a, one per iteration
768 for (; i < n; i++) {
769 __m256d avec = _mm256_broadcast_sd(a); a++;
770 __m256d bvec;
771
772 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
773 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
774 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec);
775 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec);
776 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec, bvec);
777 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec, bvec);
778 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec, bvec);
779 bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec, bvec);
780 }
781
782
// write the accumulators back to x
783 _mm256_store_pd(x + 0*4, acc0);
784 _mm256_store_pd(x + 1*4, acc1);
785 _mm256_store_pd(x + 2*4, acc2);
786 _mm256_store_pd(x + 3*4, acc3);
787 _mm256_store_pd(x + 4*4, acc4);
788 _mm256_store_pd(x + 5*4, acc5);
789 _mm256_store_pd(x + 6*4, acc6);
790 _mm256_store_pd(x + 7*4, acc7);
791 }
792
793 #endif
794
795 // experiment: process two rows at a time
796 #if 1
// Two-row version of muladd1_by_32.  For r in {0, 1}:
//   x[r*MAT_BLK_SZ + 0..31] += sum over i < n of
//                              a[r*MAT_BLK_SZ + i] * b[i*MAT_BLK_SZ + 0..31]
// Each load of b is shared by both rows.  The 32 columns are handled in two
// rounds of 16 so that only eight accumulators are live at a time.
797 static
798 void muladd2_by_32(double *x, const double *a, const double *b, long n)
799 {
800 __m256d avec0, avec1, bvec;
801 __m256d acc00, acc01, acc02, acc03;
802 __m256d acc10, acc11, acc12, acc13;
803
804
805 // round 0
// columns 0..15 of both rows
806
807 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
808 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
809 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
810 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
811
812 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
813 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
814 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
815 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
816
817 for (long i = 0; i < n; i++) {
// a[i] of row 0 and of row 1, each replicated into all four lanes
818 avec0 = _mm256_broadcast_sd(&a[i]);
819 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
820
821 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
822 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
823 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
824 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
825 }
826
827
828 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
829 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
830 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
831 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
832
833 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
834 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
835 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
836 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
837
838 // round 1
// columns 16..31 of both rows (b is offset by MAT_BLK_SZ/2)
839
840 acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
841 acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
842 acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
843 acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
844
845 acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
846 acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
847 acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
848 acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
849
850 for (long i = 0; i < n; i++) {
851 avec0 = _mm256_broadcast_sd(&a[i]);
852 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
853
854 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
855 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
856 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
857 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
858 }
859
860
861 _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
862 _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
863 _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
864 _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
865
866 _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
867 _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
868 _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
869 _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
870
871 }
872
873 #else
874
// Alternative two-row kernel, selected only by the #else branch of the
// enclosing conditional.  Same contract as the other muladd2_by_32: for
// r in {0, 1},
//   x[r*MAT_BLK_SZ + 0..31] += sum over i < n of
//                              a[r*MAT_BLK_SZ + i] * b[i*MAT_BLK_SZ + 0..31]
// The outer j loop covers the two 16-column halves; within each half the
// i loop consumes four entries of a per iteration, followed by a scalar tail.
875 static
876 void muladd2_by_32(double *x, const double *a, const double *b, long n)
877 {
878 long i, j;
879 __m256d bvec;
880 __m256d acc00, acc01, acc02, acc03;
881 __m256d acc10, acc11, acc12, acc13;
882
883
884 for (j = 0; j < 2; j++) {
885
886 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
887 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
888 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
889 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
890
891 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
892 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
893 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
894 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
895
896 for (i = 0; i <= n-4; i+=4) {
// row 0: splat a[i..i+3] into avec00..avec03
897 __m256d a0_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0));
898 __m256d a0_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2));
899 __m256d avec00 = _mm256_permute_pd(a0_0101, 0);
900 __m256d avec01 = _mm256_permute_pd(a0_0101, 0xf);
901 __m256d avec02 = _mm256_permute_pd(a0_2323, 0);
902 __m256d avec03 = _mm256_permute_pd(a0_2323, 0xf);
903
// row 1: same for a[MAT_BLK_SZ+i .. MAT_BLK_SZ+i+3]
904 __m256d a1_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0+MAT_BLK_SZ));
905 __m256d a1_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2+MAT_BLK_SZ));
906 __m256d avec10 = _mm256_permute_pd(a1_0101, 0);
907 __m256d avec11 = _mm256_permute_pd(a1_0101, 0xf);
908 __m256d avec12 = _mm256_permute_pd(a1_2323, 0);
909 __m256d avec13 = _mm256_permute_pd(a1_2323, 0xf);
910
911 bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec00, bvec); MUL_ADD(acc10, avec10, bvec);
912 bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec00, bvec); MUL_ADD(acc11, avec10, bvec);
913 bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec00, bvec); MUL_ADD(acc12, avec10, bvec);
914 bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec00, bvec); MUL_ADD(acc13, avec10, bvec);
915
916 bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec01, bvec); MUL_ADD(acc10, avec11, bvec);
917 bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec01, bvec); MUL_ADD(acc11, avec11, bvec);
918 bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec01, bvec); MUL_ADD(acc12, avec11, bvec);
919 bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec01, bvec); MUL_ADD(acc13, avec11, bvec);
920
921 bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec02, bvec); MUL_ADD(acc10, avec12, bvec);
922 bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec02, bvec); MUL_ADD(acc11, avec12, bvec);
923 bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec02, bvec); MUL_ADD(acc12, avec12, bvec);
924 bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec02, bvec); MUL_ADD(acc13, avec12, bvec);
925
926 bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec03, bvec); MUL_ADD(acc10, avec13, bvec);
927 bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec03, bvec); MUL_ADD(acc11, avec13, bvec);
928 bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec03, bvec); MUL_ADD(acc12, avec13, bvec);
929 bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec03, bvec); MUL_ADD(acc13, avec13, bvec);
930 }
931
// tail: remaining n mod 4 entries, one per iteration
932 for (; i < n; i++) {
933 __m256d avec0 = _mm256_broadcast_sd(&a[i]);
934 __m256d avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
935
936 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
937 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
938 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
939 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
940 }
941
942
943 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc00);
944 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc01);
945 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc02);
946 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc03);
947
948 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc10);
949 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc11);
950 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc12);
951 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc13);
952
953 }
954 }
955 #endif
956
957
958
959 // experiment: process three rows at a time
960 // NOTE: this makes things slower on an AVX1 platform --- not enough registers
961 // it could be faster on AVX2/FMA, where there should be enough registers
962
// Three-row kernel.  For r in {0, 1, 2}:
//   x[r*MAT_BLK_SZ + 0..31] += sum over i < n of
//                              a[r*MAT_BLK_SZ + i] * b[i*MAT_BLK_SZ + 0..31]
// Each load of b is shared by all three rows.  The 32 columns are handled in
// two rounds of 16, giving twelve live accumulators per round.
963 static
964 void muladd3_by_32(double *x, const double *a, const double *b, long n)
965 {
966 __m256d avec0, avec1, avec2, bvec;
967 __m256d acc00, acc01, acc02, acc03;
968 __m256d acc10, acc11, acc12, acc13;
969 __m256d acc20, acc21, acc22, acc23;
970
971
972 // round 0
// columns 0..15 of all three rows
973
974 acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
975 acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
976 acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
977 acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
978
979 acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
980 acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
981 acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
982 acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
983
984 acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
985 acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
986 acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
987 acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);
988
989 for (long i = 0; i < n; i++) {
// a[i] of rows 0, 1 and 2, each replicated into all four lanes
990 avec0 = _mm256_broadcast_sd(&a[i]);
991 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
992 avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
993
994 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
995 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
996 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
997 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
998 }
999
1000
1001 _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
1002 _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
1003 _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
1004 _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
1005
1006 _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
1007 _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
1008 _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
1009 _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
1010
1011 _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
1012 _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
1013 _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
1014 _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);
1015
1016 // round 1
// columns 16..31 of all three rows (b is offset by MAT_BLK_SZ/2)
1017
1018 acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
1019 acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
1020 acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
1021 acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
1022
1023 acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
1024 acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
1025 acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
1026 acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
1027
1028 acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
1029 acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
1030 acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
1031 acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);
1032
1033 for (long i = 0; i < n; i++) {
1034 avec0 = _mm256_broadcast_sd(&a[i]);
1035 avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
1036 avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
1037
1038 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
1039 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
1040 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
1041 bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
1042 }
1043
1044
1045 _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
1046 _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
1047 _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
1048 _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
1049
1050 _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
1051 _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
1052 _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
1053 _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
1054
1055 _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
1056 _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
1057 _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
1058 _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);
1059
1060 }
1061
1062 static inline
1063 void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
1064 {
1065 long i = first;
1066 #ifdef NTL_HAVE_FMA
1067 // processing three rows at a time is faster
1068 for (; i <= last-3; i+=3)
1069 muladd3_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1070 for (; i < last; i++)
1071 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1072 #else
1073 // process only two rows at a time: not enough registers :-(
1074 for (; i <= last-2; i+=2)
1075 muladd2_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1076 for (; i < last; i++)
1077 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1078 #endif
1079 }
1080
1081
1082 // this assumes n is a multiple of 16
// x[i] += y[i]*c for i in [0, n), sixteen doubles per iteration.
// The loop steps by 16 with no tail handling, so an n that is not a multiple
// of 16 makes the last iteration run past n.  x and y are accessed with
// aligned 256-bit loads/stores and are declared non-overlapping
// (NTL_RESTRICT).
1083 static inline
1084 void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
1085 {
1086 __m256d xvec0, xvec1, xvec2, xvec3;
1087 __m256d yvec0, yvec1, yvec2, yvec3;
1088
// c replicated into all four lanes
1089 __m256d cvec = _mm256_broadcast_sd(&c);
1090
1091 for (long i = 0; i < n; i += 16, x += 16, y += 16) {
1092 xvec0 = _mm256_load_pd(x+0*4);
1093 xvec1 = _mm256_load_pd(x+1*4);
1094 xvec2 = _mm256_load_pd(x+2*4);
1095 xvec3 = _mm256_load_pd(x+3*4);
1096
1097 yvec0 = _mm256_load_pd(y+0*4);
1098 yvec1 = _mm256_load_pd(y+1*4);
1099 yvec2 = _mm256_load_pd(y+2*4);
1100 yvec3 = _mm256_load_pd(y+3*4);
1101
1102 MUL_ADD(xvec0, yvec0, cvec);
1103 MUL_ADD(xvec1, yvec1, cvec);
1104 MUL_ADD(xvec2, yvec2, cvec);
1105 MUL_ADD(xvec3, yvec3, cvec);
1106
1107 _mm256_store_pd(x + 0*4, xvec0);
1108 _mm256_store_pd(x + 1*4, xvec1);
1109 _mm256_store_pd(x + 2*4, xvec2);
1110 _mm256_store_pd(x + 3*4, xvec3);
1111 }
1112 }
1113
1114 // this one is more general: does not assume that n is a
1115 // multiple of 16
// x[i] += y[i]*c for i in [0, n), for any n.
// Works in three stages: blocks of 16 doubles, then blocks of 4, then single
// elements.  The vector stages use aligned 256-bit loads/stores on x and y.
1116 static inline
1117 void muladd_interval1(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
1118 {
1119
1120 __m256d xvec0, xvec1, xvec2, xvec3;
1121 __m256d yvec0, yvec1, yvec2, yvec3;
1122 __m256d cvec;
1123
// cvec is only read by the two vector loops, which need n >= 4 to run
1124 if (n >= 4)
1125 cvec = _mm256_broadcast_sd(&c);
1126
1127 long i=0;
// stage 1: sixteen doubles per iteration
1128 for (; i <= n-16; i += 16, x += 16, y += 16) {
1129 xvec0 = _mm256_load_pd(x+0*4);
1130 xvec1 = _mm256_load_pd(x+1*4);
1131 xvec2 = _mm256_load_pd(x+2*4);
1132 xvec3 = _mm256_load_pd(x+3*4);
1133
1134 yvec0 = _mm256_load_pd(y+0*4);
1135 yvec1 = _mm256_load_pd(y+1*4);
1136 yvec2 = _mm256_load_pd(y+2*4);
1137 yvec3 = _mm256_load_pd(y+3*4);
1138
1139 MUL_ADD(xvec0, yvec0, cvec);
1140 MUL_ADD(xvec1, yvec1, cvec);
1141 MUL_ADD(xvec2, yvec2, cvec);
1142 MUL_ADD(xvec3, yvec3, cvec);
1143
1144 _mm256_store_pd(x + 0*4, xvec0);
1145 _mm256_store_pd(x + 1*4, xvec1);
1146 _mm256_store_pd(x + 2*4, xvec2);
1147 _mm256_store_pd(x + 3*4, xvec3);
1148 }
1149
// stage 2: four doubles per iteration
1150 for (; i <= n-4; i += 4, x += 4, y += 4) {
1151 xvec0 = _mm256_load_pd(x+0*4);
1152 yvec0 = _mm256_load_pd(y+0*4);
1153 MUL_ADD(xvec0, yvec0, cvec);
1154 _mm256_store_pd(x + 0*4, xvec0);
1155 }
1156
// stage 3: scalar tail
1157 for (; i < n; i++, x++, y++) {
1158 *x += (*y)*c;
1159 }
1160 }
1161
1162 #define AVX_PD_SZ (4)
1163
1164 // experimental: assumes n is a multiple of 4 in the range [0..32]
1165 #if 1
// x[i] += y[i]*c for i in [0, n), written as a fall-through switch over the
// number of 4-double groups.
// n is divided by 4 (any remainder is dropped); if the resulting group count
// is outside 1..8 the function returns without touching x.
1166 static inline
1167 void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
1168 {
1169 n /= 4;
1170 if (n <= 0 || n > 8) return;
1171
// move both pointers one past the last group so that each case below can
// address its group by a fixed negative offset
1172 x += n*4;
1173 y += n*4;
1174
1175 // n in [1..8]
1176
1177 __m256d xvec, yvec, cvec;
1178
1179 cvec = _mm256_broadcast_sd(&c);
1180
// No break statements: entering at case n falls through cases n-1, ..., 1,
// so exactly n groups are processed.
1181 switch (n) {
1182 case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec);
1183 case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec);
1184 case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec);
1185 case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec);
1186 case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec);
1187 case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec);
1188 case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec);
1189 case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec);
1190 }
1191
1192 }
1193 #else
1194 static inline
1195 void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
1196 {
1197 for (long i = 0; i < n; i++)
1198 x[i] += y[i]*c;
1199 }
1200 #endif
1201
1202 #endif
1203
1204
// DO_MUL(a, b): convert both operands to long, multiply, and convert the
// product to unsigned long.  Converting before the multiply matters for the
// unsigned int kernels that use this macro: the commented-out form below
// would multiply in the operands' own (narrower) type.
// NOTE(review): the multiply is done in signed long, so callers presumably
// keep operands small enough that the product cannot overflow -- confirm.
1205 #define DO_MUL(a, b) ((unsigned long) (long(a)*long(b)))
1206 //#define DO_MUL(a, b) ((a)*(b))
1207
1208 static
1209 inline void muladd_interval(unsigned long * NTL_RESTRICT x, unsigned long * NTL_RESTRICT y,
1210 unsigned long c, long n)
1211 {
1212 for (long i = 0; i < n; i++)
1213 x[i] += DO_MUL(y[i], c);
1214 }
1215
1216 static
1217 void muladd1_by_32(unsigned long *x, const unsigned long *a, const unsigned long *b,
1218 long n)
1219 {
1220 for (long j = 0; j < MAT_BLK_SZ; j++) {
1221 unsigned long sum = x[j];
1222 long i = 0;
1223
1224 for (; i <= n-4; i += 4) {
1225 sum += DO_MUL(a[i+0], b[i+0]);
1226 sum += DO_MUL(a[i+1], b[i+1]);
1227 sum += DO_MUL(a[i+2], b[i+2]);
1228 sum += DO_MUL(a[i+3], b[i+3]);
1229 }
1230
1231 for (; i < n; i++)
1232 sum += DO_MUL(a[i], b[i]);
1233
1234 x[j] = sum;
1235 b += MAT_BLK_SZ;
1236 }
1237 }
1238
1239 // experiment with shorter int's
1240 static
1241 void muladd1_by_32(unsigned long *x, const unsigned int *a, const unsigned int *b,
1242 long n)
1243 {
1244 for (long j = 0; j < MAT_BLK_SZ; j++) {
1245 unsigned long sum = x[j];
1246 long i = 0;
1247
1248 for (; i <= n-4; i += 4) {
1249 sum += DO_MUL(a[i+0], b[i+0]);
1250 sum += DO_MUL(a[i+1], b[i+1]);
1251 sum += DO_MUL(a[i+2], b[i+2]);
1252 sum += DO_MUL(a[i+3], b[i+3]);
1253 }
1254
1255 for (; i < n; i++)
1256 sum += DO_MUL(a[i], b[i]);
1257
1258 x[j] = sum;
1259 b += MAT_BLK_SZ;
1260 }
1261 }
1262
1263 #if 0
// Disabled by the enclosing "#if 0"; the #else branch is what gets compiled.
// Straight-line variant: for each j in [0, MAT_BLK_SZ), adds the 32 products
// DO_MUL(a[i], b[i]), i = 0..31, to x[j], then advances b by MAT_BLK_SZ.
static
void muladd1_by_32_full(unsigned long *x, const unsigned long *a, const unsigned long *b)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {
      unsigned long sum = x[j];
      long i = 0;

      sum += DO_MUL(a[i+0], b[i+0]);
      sum += DO_MUL(a[i+1], b[i+1]);
      sum += DO_MUL(a[i+2], b[i+2]);
      sum += DO_MUL(a[i+3], b[i+3]);
      sum += DO_MUL(a[i+4], b[i+4]);
      sum += DO_MUL(a[i+5], b[i+5]);
      sum += DO_MUL(a[i+6], b[i+6]);
      sum += DO_MUL(a[i+7], b[i+7]);
      sum += DO_MUL(a[i+8], b[i+8]);
      sum += DO_MUL(a[i+9], b[i+9]);
      sum += DO_MUL(a[i+10], b[i+10]);
      sum += DO_MUL(a[i+11], b[i+11]);
      sum += DO_MUL(a[i+12], b[i+12]);
      sum += DO_MUL(a[i+13], b[i+13]);
      sum += DO_MUL(a[i+14], b[i+14]);
      sum += DO_MUL(a[i+15], b[i+15]);
      sum += DO_MUL(a[i+16], b[i+16]);
      sum += DO_MUL(a[i+17], b[i+17]);
      sum += DO_MUL(a[i+18], b[i+18]);
      sum += DO_MUL(a[i+19], b[i+19]);
      sum += DO_MUL(a[i+20], b[i+20]);
      sum += DO_MUL(a[i+21], b[i+21]);
      sum += DO_MUL(a[i+22], b[i+22]);
      sum += DO_MUL(a[i+23], b[i+23]);
      sum += DO_MUL(a[i+24], b[i+24]);
      sum += DO_MUL(a[i+25], b[i+25]);
      sum += DO_MUL(a[i+26], b[i+26]);
      sum += DO_MUL(a[i+27], b[i+27]);
      sum += DO_MUL(a[i+28], b[i+28]);
      sum += DO_MUL(a[i+29], b[i+29]);
      sum += DO_MUL(a[i+30], b[i+30]);
      sum += DO_MUL(a[i+31], b[i+31]);

      x[j] = sum;
      b += MAT_BLK_SZ;
   }
}
1308 #else
1309
1310 // this version is faster (by about 25%) on a Sandybridge machine
1311
// ONE_STEP_L(i): one multiply-accumulate step for four output columns at once.
// Expects sum, sum_1, sum_2, sum_3 (accumulators) and a, b, b_1, b_2, b_3
// (input pointers) to be in scope at the point of use. The expansion has no
// trailing semicolon; write "ONE_STEP_L(i);" at the call site.
// The definition ends on its last statement: no dangling line continuation,
// so the line following the macro can never be spliced into it.
#define ONE_STEP_L(i) \
  sum += DO_MUL(a[i],b[i]);\
  sum_1 += DO_MUL(a[i],b_1[i]);\
  sum_2 += DO_MUL(a[i],b_2[i]);\
  sum_3 += DO_MUL(a[i],b_3[i])

1318
// Full-panel (32-term) multiply-accumulate for unsigned long inputs.
// Processes four output entries x[j..j+3] per outer iteration; b, b_1, b_2,
// b_3 point at four consecutive MAT_BLK_SZ-long rows of the b block, and
// each ONE_STEP_L(i) adds a[i] times the i-th entry of each row to the
// matching accumulator. The local names sum*, b_* are the ones the
// ONE_STEP_L macro refers to, so they must not be renamed.
static
void muladd1_by_32_full(unsigned long *x, const unsigned long *a, const unsigned long *b)
{
   for (long j = 0; j < MAT_BLK_SZ; j+=4) {

      // accumulators start from the current contents of x
      unsigned long sum = x[j];
      unsigned long sum_1 = x[j+1];
      unsigned long sum_2 = x[j+2];
      unsigned long sum_3 = x[j+3];

      const unsigned long *b_1 = b+MAT_BLK_SZ;
      const unsigned long *b_2 = b+2*MAT_BLK_SZ;
      const unsigned long *b_3 = b+3*MAT_BLK_SZ;

      ONE_STEP_L(0);
      ONE_STEP_L(1);
      ONE_STEP_L(2);
      ONE_STEP_L(3);
      ONE_STEP_L(4);
      ONE_STEP_L(5);
      ONE_STEP_L(6);
      ONE_STEP_L(7);
      ONE_STEP_L(8);
      ONE_STEP_L(9);
      ONE_STEP_L(10);
      ONE_STEP_L(11);
      ONE_STEP_L(12);
      ONE_STEP_L(13);
      ONE_STEP_L(14);
      ONE_STEP_L(15);
      ONE_STEP_L(16);
      ONE_STEP_L(17);
      ONE_STEP_L(18);
      ONE_STEP_L(19);
      ONE_STEP_L(20);
      ONE_STEP_L(21);
      ONE_STEP_L(22);
      ONE_STEP_L(23);
      ONE_STEP_L(24);
      ONE_STEP_L(25);
      ONE_STEP_L(26);
      ONE_STEP_L(27);
      ONE_STEP_L(28);
      ONE_STEP_L(29);
      ONE_STEP_L(30);
      ONE_STEP_L(31);

      x[j] = sum;
      x[j+1] = sum_1;
      x[j+2] = sum_2;
      x[j+3] = sum_3;

      // move on to the next group of four rows of b
      b += 4*MAT_BLK_SZ;
   }
}
1374
// experiment with shorter int's:
// same full-panel (32-term) kernel as the unsigned long overload, but with
// unsigned int inputs; accumulation is still done in unsigned long.
// Four output entries x[j..j+3] are produced per outer iteration. The local
// names sum*, b_* are the ones the ONE_STEP_L macro refers to.
static
void muladd1_by_32_full(unsigned long *x, const unsigned int *a, const unsigned int *b)
{
   for (long j = 0; j < MAT_BLK_SZ; j+=4) {

      // accumulators start from the current contents of x
      unsigned long sum = x[j];
      unsigned long sum_1 = x[j+1];
      unsigned long sum_2 = x[j+2];
      unsigned long sum_3 = x[j+3];

      const unsigned int *b_1 = b+MAT_BLK_SZ;
      const unsigned int *b_2 = b+2*MAT_BLK_SZ;
      const unsigned int *b_3 = b+3*MAT_BLK_SZ;

      ONE_STEP_L(0);
      ONE_STEP_L(1);
      ONE_STEP_L(2);
      ONE_STEP_L(3);
      ONE_STEP_L(4);
      ONE_STEP_L(5);
      ONE_STEP_L(6);
      ONE_STEP_L(7);
      ONE_STEP_L(8);
      ONE_STEP_L(9);
      ONE_STEP_L(10);
      ONE_STEP_L(11);
      ONE_STEP_L(12);
      ONE_STEP_L(13);
      ONE_STEP_L(14);
      ONE_STEP_L(15);
      ONE_STEP_L(16);
      ONE_STEP_L(17);
      ONE_STEP_L(18);
      ONE_STEP_L(19);
      ONE_STEP_L(20);
      ONE_STEP_L(21);
      ONE_STEP_L(22);
      ONE_STEP_L(23);
      ONE_STEP_L(24);
      ONE_STEP_L(25);
      ONE_STEP_L(26);
      ONE_STEP_L(27);
      ONE_STEP_L(28);
      ONE_STEP_L(29);
      ONE_STEP_L(30);
      ONE_STEP_L(31);

      x[j] = sum;
      x[j+1] = sum_1;
      x[j+2] = sum_2;
      x[j+3] = sum_3;

      // move on to the next group of four rows of b
      b += 4*MAT_BLK_SZ;
   }
}
1431
1432 #endif
1433
1434 static inline
1435 void muladd_all_by_32(long first, long last, unsigned long *x, const unsigned int *a, const unsigned int *b, long n)
1436 {
1437 if (n == MAT_BLK_SZ) {
1438 for (long i = first; i < last; i++)
1439 muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b);
1440 }
1441 else {
1442 for (long i = first; i < last; i++)
1443 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1444 }
1445 }
1446
1447 static inline
1448 void muladd_all_by_32(long first, long last, unsigned long *x, const unsigned long *a, const unsigned long *b, long n)
1449 {
1450 if (n == MAT_BLK_SZ) {
1451 for (long i = first; i < last; i++)
1452 muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b);
1453 }
1454 else {
1455 for (long i = first; i < last; i++)
1456 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1457 }
1458 }
1459
// uhlong: element type for the buffers used by blk_mul_L.
// It is unsigned int when an int has at least half as many bits as a long,
// and unsigned long otherwise.
#if (NTL_BITS_PER_INT >= NTL_BITS_PER_LONG/2)

typedef unsigned int uhlong;

#else

typedef unsigned long uhlong;

#endif
1469
1470
1471
1472
1473 // NOTE: the following code is hardcoded for MAT_BLK_SZ == 32.
1474 // Also, we special case NTL_BITS_PER_LONG-NTL_SP_NBITS > 2, which
1475 // allows us to accumulate all 32 products without additional carries.
1476
1477 #if (NTL_BITS_PER_LONG-NTL_SP_NBITS > 2)
1478
1479 static
1480 void muladd1_by_32(long *x, const long *a, const long *b,
1481 long n, long p, sp_ll_reduce_struct ll_red_struct)
1482 {
1483 for (long j = 0; j < MAT_BLK_SZ; j++) {
1484
1485 ll_type sum;
1486 ll_init(sum, x[j]);
1487 #if 0
1488 for (long i = 0; i < n; i++)
1489 ll_imul_add(sum, a[i], b[i]);
1490 #else
1491 long i=0;
1492 for(; i <= n-8; i+= 8) {
1493 ll_imul_add(sum, a[i+0], b[i+0]);
1494 ll_imul_add(sum, a[i+1], b[i+1]);
1495 ll_imul_add(sum, a[i+2], b[i+2]);
1496 ll_imul_add(sum, a[i+3], b[i+3]);
1497
1498 ll_imul_add(sum, a[i+4], b[i+4]);
1499 ll_imul_add(sum, a[i+5], b[i+5]);
1500 ll_imul_add(sum, a[i+6], b[i+6]);
1501 ll_imul_add(sum, a[i+7], b[i+7]);
1502 }
1503
1504 for (; i < n; i++)
1505 ll_imul_add(sum, a[i], b[i]);
1506
1507 #endif
1508
1509 unsigned long sum0 = ll_get_lo(sum);
1510 unsigned long sum1 = ll_get_hi(sum);
1511
1512 long res;
1513
1514 if (ll_red_struct.nbits == NTL_SP_NBITS)
1515 res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
1516 else
1517 res = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
1518
1519
1520 x[j] = res;
1521 b += MAT_BLK_SZ;
1522 }
1523 }
1524
1525 #if 0
// Disabled by the enclosing "#if 0"; the "#elif 1" branch is what gets compiled.
// Straight-line variant: for each j in [0, MAT_BLK_SZ), accumulates x[j] plus
// the 32 products a[i]*b[i] in a double-word accumulator, reduces it with
// sp_ll_red_31 / sp_ll_red_31_normalized, and advances b by MAT_BLK_SZ.
static
void muladd1_by_32_full(long *x, const long *a, const long *b,
                        long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      ll_imul_add(sum, a[0], b[0]);
      ll_imul_add(sum, a[1], b[1]);
      ll_imul_add(sum, a[2], b[2]);
      ll_imul_add(sum, a[3], b[3]);
      ll_imul_add(sum, a[4], b[4]);
      ll_imul_add(sum, a[5], b[5]);
      ll_imul_add(sum, a[6], b[6]);
      ll_imul_add(sum, a[7], b[7]);
      ll_imul_add(sum, a[8], b[8]);
      ll_imul_add(sum, a[9], b[9]);
      ll_imul_add(sum, a[10], b[10]);
      ll_imul_add(sum, a[11], b[11]);
      ll_imul_add(sum, a[12], b[12]);
      ll_imul_add(sum, a[13], b[13]);
      ll_imul_add(sum, a[14], b[14]);
      ll_imul_add(sum, a[15], b[15]);
      ll_imul_add(sum, a[16], b[16]);
      ll_imul_add(sum, a[17], b[17]);
      ll_imul_add(sum, a[18], b[18]);
      ll_imul_add(sum, a[19], b[19]);
      ll_imul_add(sum, a[20], b[20]);
      ll_imul_add(sum, a[21], b[21]);
      ll_imul_add(sum, a[22], b[22]);
      ll_imul_add(sum, a[23], b[23]);
      ll_imul_add(sum, a[24], b[24]);
      ll_imul_add(sum, a[25], b[25]);
      ll_imul_add(sum, a[26], b[26]);
      ll_imul_add(sum, a[27], b[27]);
      ll_imul_add(sum, a[28], b[28]);
      ll_imul_add(sum, a[29], b[29]);
      ll_imul_add(sum, a[30], b[30]);
      ll_imul_add(sum, a[31], b[31]);

      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      long res;

      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
      else
         res = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);


      x[j] = res;
      b += MAT_BLK_SZ;
   }
}

1583
1584 #elif 1
1585 // This version is consistently fastest on tests on Sandybridge and Haswell
1586
1587
1588
// ONE_STEP(i): one double-word multiply-accumulate step for four output
// columns at once. Expects sum, sum_1, sum_2, sum_3 (ll_type accumulators)
// and a, b, b_1, b_2, b_3 (input pointers) to be in scope at the point of use.
// The expansion has no trailing semicolon and no dangling line continuation:
// write "ONE_STEP(i);" at the call site.
#define ONE_STEP(i) \
  ll_imul_add(sum, a[i], b[i]);\
  ll_imul_add(sum_1, a[i], b_1[i]);\
  ll_imul_add(sum_2, a[i], b_2[i]);\
  ll_imul_add(sum_3, a[i], b_3[i])

1595
1596 void muladd1_by_32_full(long *x, const long *a, const long *b,
1597 long p, sp_ll_reduce_struct ll_red_struct)
1598 {
1599 for (long j = 0; j < MAT_BLK_SZ; j+=4) {
1600
1601 ll_type sum, sum_1, sum_2, sum_3;
1602 ll_init(sum, x[j]);
1603 ll_init(sum_1, x[j+1]);
1604 ll_init(sum_2, x[j+2]);
1605 ll_init(sum_3, x[j+3]);
1606
1607 const long *b_1 = b+MAT_BLK_SZ;
1608 const long *b_2 = b+2*MAT_BLK_SZ;
1609 const long *b_3 = b+3*MAT_BLK_SZ;
1610
1611 ONE_STEP(0);
1612 ONE_STEP(1);
1613 ONE_STEP(2);
1614 ONE_STEP(3);
1615 ONE_STEP(4);
1616 ONE_STEP(5);
1617 ONE_STEP(6);
1618 ONE_STEP(7);
1619 ONE_STEP(8);
1620 ONE_STEP(9);
1621 ONE_STEP(10);
1622 ONE_STEP(11);
1623 ONE_STEP(12);
1624 ONE_STEP(13);
1625 ONE_STEP(14);
1626 ONE_STEP(15);
1627 ONE_STEP(16);
1628 ONE_STEP(17);
1629 ONE_STEP(18);
1630 ONE_STEP(19);
1631 ONE_STEP(20);
1632 ONE_STEP(21);
1633 ONE_STEP(22);
1634 ONE_STEP(23);
1635 ONE_STEP(24);
1636 ONE_STEP(25);
1637 ONE_STEP(26);
1638 ONE_STEP(27);
1639 ONE_STEP(28);
1640 ONE_STEP(29);
1641 ONE_STEP(30);
1642 ONE_STEP(31);
1643
1644 unsigned long sum0 = ll_get_lo(sum);
1645 unsigned long sum1 = ll_get_hi(sum);
1646
1647 unsigned long sum0_1 = ll_get_lo(sum_1);
1648 unsigned long sum1_1 = ll_get_hi(sum_1);
1649
1650 unsigned long sum0_2 = ll_get_lo(sum_2);
1651 unsigned long sum1_2 = ll_get_hi(sum_2);
1652
1653 unsigned long sum0_3 = ll_get_lo(sum_3);
1654 unsigned long sum1_3 = ll_get_hi(sum_3);
1655
1656 if (ll_red_struct.nbits == NTL_SP_NBITS) {
1657 x[j] = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
1658 x[j+1] = sp_ll_red_31_normalized(0, sum1_1, sum0_1, p, ll_red_struct);
1659 x[j+2] = sp_ll_red_31_normalized(0, sum1_2, sum0_2, p, ll_red_struct);
1660 x[j+3] = sp_ll_red_31_normalized(0, sum1_3, sum0_3, p, ll_red_struct);
1661 }
1662 else {
1663 x[j] = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
1664 x[j+1] = sp_ll_red_31(0, sum1_1, sum0_1, p, ll_red_struct);
1665 x[j+2] = sp_ll_red_31(0, sum1_2, sum0_2, p, ll_red_struct);
1666 x[j+3] = sp_ll_red_31(0, sum1_3, sum0_3, p, ll_red_struct);
1667 }
1668
1669
1670 b += 4*MAT_BLK_SZ;
1671 }
1672 }
1673
1674
1675 #endif
1676
1677 #else
1678
1679
1680 static
1681 void muladd1_by_32(long *x, const long *a, const long *b,
1682 long n, long p, sp_ll_reduce_struct ll_red_struct)
1683 {
1684 for (long j = 0; j < MAT_BLK_SZ; j++) {
1685
1686 ll_type sum;
1687 ll_init(sum, x[j]);
1688
1689 long i = 0;
1690 for (; i < n-16; i++)
1691 ll_imul_add(sum, a[i], b[i]);
1692
1693 ll_type acc21;
1694 ll_init(acc21, ll_get_hi(sum));
1695 unsigned long acc0 = ll_get_lo(sum);
1696 ll_init(sum, acc0);
1697
1698 for (; i < n; i++)
1699 ll_imul_add(sum, a[i], b[i]);
1700
1701 acc0 = ll_get_lo(sum);
1702 ll_add(acc21, ll_get_hi(sum));
1703
1704 long res;
1705
1706 if (ll_red_struct.nbits == NTL_SP_NBITS)
1707 res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
1708 else
1709 res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
1710
1711 x[j] = res;
1712 b += MAT_BLK_SZ;
1713 }
1714 }
1715
// Full-panel (32-term) multiply-accumulate modulo p, for the build where the
// NTL_BITS_PER_LONG-NTL_SP_NBITS > 2 special case does not apply.
// The 32 products are accumulated in two halves of 16: after the first half
// the high word of the accumulator is moved into a second accumulator
// (acc21) and accumulation restarts from the low word; after the second half
// the new high word is added to acc21. The three resulting words are reduced
// with sp_ll_red_31 / sp_ll_red_31_normalized.
static
void muladd1_by_32_full(long *x, const long *a, const long *b,
                        long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      // first 16 products
      ll_imul_add(sum, a[0], b[0]);
      ll_imul_add(sum, a[1], b[1]);
      ll_imul_add(sum, a[2], b[2]);
      ll_imul_add(sum, a[3], b[3]);
      ll_imul_add(sum, a[4], b[4]);
      ll_imul_add(sum, a[5], b[5]);
      ll_imul_add(sum, a[6], b[6]);
      ll_imul_add(sum, a[7], b[7]);
      ll_imul_add(sum, a[8], b[8]);
      ll_imul_add(sum, a[9], b[9]);
      ll_imul_add(sum, a[10], b[10]);
      ll_imul_add(sum, a[11], b[11]);
      ll_imul_add(sum, a[12], b[12]);
      ll_imul_add(sum, a[13], b[13]);
      ll_imul_add(sum, a[14], b[14]);
      ll_imul_add(sum, a[15], b[15]);

      // move the high word aside and continue from the low word only
      ll_type acc21;
      ll_init(acc21, ll_get_hi(sum));
      unsigned long acc0 = ll_get_lo(sum);
      ll_init(sum, acc0);

      // last 16 products
      ll_imul_add(sum, a[16], b[16]);
      ll_imul_add(sum, a[17], b[17]);
      ll_imul_add(sum, a[18], b[18]);
      ll_imul_add(sum, a[19], b[19]);
      ll_imul_add(sum, a[20], b[20]);
      ll_imul_add(sum, a[21], b[21]);
      ll_imul_add(sum, a[22], b[22]);
      ll_imul_add(sum, a[23], b[23]);
      ll_imul_add(sum, a[24], b[24]);
      ll_imul_add(sum, a[25], b[25]);
      ll_imul_add(sum, a[26], b[26]);
      ll_imul_add(sum, a[27], b[27]);
      ll_imul_add(sum, a[28], b[28]);
      ll_imul_add(sum, a[29], b[29]);
      ll_imul_add(sum, a[30], b[30]);
      ll_imul_add(sum, a[31], b[31]);

      acc0 = ll_get_lo(sum);
      ll_add(acc21, ll_get_hi(sum));

      long res;

      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
      else
         res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);

      x[j] = res;
      b += MAT_BLK_SZ;
   }
}
1778
1779
1780
1781 #endif
1782
1783
// Multiply-accumulate modulo p using single-word partial sums.
// For each j in [0, MAT_BLK_SZ): products a[i]*b[i] are summed in groups of
// up to 16 into an unsigned long; the first group is added to sum[0] (seeded
// with x[j]) and the second to sum[1]. Both words are reduced with
// sp_ll_red_21 and combined with AddMod.
// Only two slots exist, so n must not exceed 32.
// NOTE(review): each group adds up to 16 products in one unsigned long with
// no carry handling; this is only exact if 16 such products fit in a word --
// confirm the bound on p before using this routine. It is not called by
// muladd_all_by_32, which uses the half1 variant instead.
static
void muladd1_by_32_half2(long *x, const long *a, const long *b,
                         long n, long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      unsigned long sum[2];
      sum[0] = x[j];
      sum[1] = 0;

      long k=0;   // index of the next slot of sum[] to receive a group
      long i=0;
      for(; i <= n-16; i+= 16) {
         unsigned long lsum = a[i+0]*b[i+0];
         lsum += a[i+1]*b[i+1];
         lsum += a[i+2]*b[i+2];
         lsum += a[i+3]*b[i+3];
         lsum += a[i+4]*b[i+4];
         lsum += a[i+5]*b[i+5];
         lsum += a[i+6]*b[i+6];
         lsum += a[i+7]*b[i+7];
         lsum += a[i+8]*b[i+8];
         lsum += a[i+9]*b[i+9];
         lsum += a[i+10]*b[i+10];
         lsum += a[i+11]*b[i+11];
         lsum += a[i+12]*b[i+12];
         lsum += a[i+13]*b[i+13];
         lsum += a[i+14]*b[i+14];
         lsum += a[i+15]*b[i+15];
         sum[k++] += lsum;
      }

      // leftover group of fewer than 16 products
      if (i < n) {
         unsigned long lsum = a[i]*b[i];
         for (i++; i < n; i++)
            lsum += a[i]*b[i];
         sum[k++] += lsum;
      }


      long t0 = sp_ll_red_21(0, sum[0], p, ll_red_struct);
      long t1 = sp_ll_red_21(0, sum[1], p, ll_red_struct);
      x[j] = AddMod(t0, t1, p);

      b += MAT_BLK_SZ;
   }
}
1831
1832
1833
1834 // NOTE: oddly, this is slightly faster than the half2 routine, which
1835 // I would have thought would be faster
1836 // DIRT: this assumes MAT_BLK_SZ < (1L << NTL_BITS_PER_LONG/2),
1837 // which will hold unconditionally for MAT_BLK_SZ < 2^16.
1838 static
1839 void muladd1_by_32_half1(long *x, const long *a, const long *b,
1840 long n, long p, sp_ll_reduce_struct ll_red_struct)
1841 {
1842 for (long j = 0; j < MAT_BLK_SZ; j++) {
1843
1844 ll_type sum;
1845 ll_init(sum, x[j]);
1846
1847 long i=0;
1848 for(; i <= n-4; i+= 4) {
1849 unsigned long lsum = a[i+0]*b[i+0];
1850 lsum += a[i+1]*b[i+1];
1851 lsum += a[i+2]*b[i+2];
1852 lsum += a[i+3]*b[i+3];
1853 ll_add(sum, lsum);
1854 }
1855
1856 if (i < n) {
1857 unsigned long lsum = a[i]*b[i];
1858 for (i++; i < n; i++)
1859 lsum += a[i]*b[i];
1860 ll_add(sum, lsum);
1861 }
1862
1863 unsigned long sum0 = ll_get_lo(sum);
1864 unsigned long sum1 = ll_get_hi(sum);
1865 x[j] = sp_ll_red_21(sum1, sum0, p, ll_red_struct);
1866
1867 b += MAT_BLK_SZ;
1868 }
1869 }
1870
1871
1872 static inline
1873 void muladd_all_by_32(long first, long last, long *x, const long *a, const long *b, long n,
1874 long p, sp_ll_reduce_struct ll_red_struct)
1875 {
1876 if ((p-1) >= (1L << ((NTL_BITS_PER_LONG/2)-1))) {
1877 if (n == MAT_BLK_SZ) {
1878 for (long i = first; i < last; i++)
1879 muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, p, ll_red_struct);
1880 }
1881 else {
1882 for (long i = first; i < last; i++)
1883 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct);
1884 }
1885 }
1886 else {
1887 for (long i = first; i < last; i++)
1888 muladd1_by_32_half1(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct);
1889 }
1890 }
1891
1892
1893
1894 #endif
1895
1896
1897
1898 static
1899 inline void muladd_interval(long * NTL_RESTRICT x, long * NTL_RESTRICT y,
1900 long c, long n, long p, mulmod_t pinv)
1901 {
1902 mulmod_precon_t cpinv = PrepMulModPrecon(c, p, pinv);
1903 for (long i = 0; i < n; i++) {
1904 long t = MulModPrecon(y[i], c, p, cpinv);
1905 x[i] = AddMod(x[i], t, p);
1906 }
1907 }
1908
1909
1910 // ******************************************************************
1911 //
1912 // General matrix multiplication code
1913 //
1914 // ******************************************************************
1915
1916
1917
1918
1919
1920 static
1921 void basic_mul(const mat_window_zz_p& X,
1922 const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
1923 {
1924 long n = A.NumRows();
1925 long l = A.NumCols();
1926 long m = B.NumCols();
1927
1928 long p = zz_p::modulus();
1929 mulmod_t pinv = zz_p::ModulusInverse();
1930
1931 const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;
1932
1933 NTL_GEXEC_RANGE(seq, n, first, last) {
1934
1935 for (long i = first; i < last; i++) {
1936 long j, k;
1937 const zz_p* ap = &A[i][0];
1938
1939 zz_p *xp = &X[i][0];
1940 for (j = 0; j < m; j++) xp[j].LoopHole() = 0;
1941
1942 for (k = 0; k < l; k++) {
1943 long aa = rep(ap[k]);
1944 if (aa != 0) {
1945 const zz_p* bp = &B[k][0];
1946 long T1;
1947 mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);
1948
1949 for (j = 0; j < m; j++) {
1950 T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
1951 xp[j].LoopHole() = AddMod(rep(xp[j]), T1, p);
1952 }
1953 }
1954 }
1955 }
1956
1957 } NTL_GEXEC_RANGE_END
1958 }
1959
1960
1961
1962
1963 #ifdef NTL_HAVE_LL_TYPE
1964
1965 static
1966 void alt_mul_L(const mat_window_zz_p& X,
1967 const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
1968 {
1969 long n = A.NumRows();
1970 long l = A.NumCols();
1971 long m = B.NumCols();
1972
1973 long p = zz_p::modulus();
1974 sp_reduce_struct red_struct = zz_p::red_struct();
1975
1976 const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;
1977
1978 NTL_GEXEC_RANGE(seq, m, first, last) {
1979
1980 Vec<long> B_col;
1981 B_col.SetLength(l);
1982 long *bp = B_col.elts();
1983
1984 long i, j, k;
1985
1986 for (j = first; j < last; j++) {
1987 for (k = 0; k < l; k++) bp[k] = rep(B[k][j]);
1988
1989 for (i = 0; i < n; i++) {
1990 const zz_p *ap = &A[i][0];
1991 X[i][j].LoopHole() = InnerProd_L(bp, ap, l, p, red_struct);
1992 }
1993 }
1994
1995 } NTL_GEXEC_RANGE_END
1996 }
1997
1998
1999 static
2000 void alt_mul_LL(const mat_window_zz_p& X,
2001 const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
2002 {
2003 long n = A.NumRows();
2004 long l = A.NumCols();
2005 long m = B.NumCols();
2006
2007 long p = zz_p::modulus();
2008 sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
2009
2010 const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;
2011
2012 NTL_GEXEC_RANGE(seq, m, first, last) {
2013
2014 Vec<long> B_col;
2015 B_col.SetLength(l);
2016 long *bp = B_col.elts();
2017
2018 long i, j, k;
2019
2020 for (j = first; j < last; j++) {
2021 for (k = 0; k < l; k++) bp[k] = rep(B[k][j]);
2022
2023 for (i = 0; i < n; i++) {
2024 const zz_p *ap = &A[i][0];
2025 X[i][j].LoopHole() = InnerProd_LL(bp, ap, l, p, ll_red_struct);
2026 }
2027 }
2028
2029 } NTL_GEXEC_RANGE_END
2030 }
2031
2032
2033 #ifdef NTL_HAVE_AVX
2034
// Blocked product X = A*B over zz_p with double-precision accumulation.
//
// A is first copied, panel by panel (MAT_BLK_SZ columns per panel, zero
// padded), into aligned buffers of doubles. The columns of X are then handled
// in panels of MAT_BLK_SZ columns, distributed over the NTL_GEXEC_RANGE range
// [first, last). For each output panel, the matching MAT_BLK_SZ x MAT_BLK_SZ
// block of B is copied into brec and muladd_all_by_32 accumulates into the
// double buffer xbp. Accumulators are reduced mod p whenever the remaining
// headroom (tracked by red_count) is too small for another panel, and once
// more when the results are stored into X.
static
void blk_mul_DD(const mat_window_zz_p& X,
                const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_reduce_struct red_struct = zz_p::red_struct();

   // one buffer of n*MAT_BLK_SZ doubles per panel of columns of A
   UniqueArray< AlignedArray<double> > A_buf;
   long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
   A_buf.SetLength(npanels);

   for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
      long k_max = min(kk+MAT_BLK_SZ, l);

      A_buf[panel].SetLength(n * MAT_BLK_SZ);
      double *abp = &A_buf[panel][0];

      for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
         const zz_p *ap1 = &A[i][0];
         for (long k = kk; k < k_max; k++) {
            abp[k-kk] = rep(ap1[k]);
         }
         // zero padding for a final, partial panel
         for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
            abp[k-kk] = 0;
         }
      }
   }

   long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, nxpanels, first, last)
   NTL_IMPORT(n)
   NTL_IMPORT(l)
   NTL_IMPORT(m)
   NTL_IMPORT(p)
   NTL_IMPORT(red_struct)

   // scratch block of B and accumulator panel, allocated once per range
   AlignedArray<double> B_rec;
   B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   double *brec = B_rec.get();

   AlignedArray<double> X_buf;
   X_buf.SetLength(n*MAT_BLK_SZ);
   double *xbp = X_buf.get();

   long jj, kk;
   long i, j, k;
   long panel;
   long xpanel;

   for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
        xpanel++, jj += MAT_BLK_SZ) {

      long j_max = min(jj+MAT_BLK_SZ, m);

      for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;

      // number of products of size at most (p-1)^2 that can be added to a
      // value of at most p-1 without exceeding MAX_DBL_INT
      long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
      long red_count = red_trigger;

      for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
         long k_max = min(kk+MAT_BLK_SZ, l);

         // fill brec (row k-kk, column j-jj), zero padding missing columns;
         // rows at or beyond k_max-kk are left untouched
         for (k = kk; k < k_max; k++) {
            const zz_p *bp = &B[k][0];
            for (j = jj; j < j_max; j++)
               brec[(k-kk)*MAT_BLK_SZ+(j-jj)] = rep(bp[j]);
            for (j = j_max; j < jj+MAT_BLK_SZ; j++)
               brec[(k-kk)*MAT_BLK_SZ+(j-jj)] = 0;
         }


         // not enough headroom left for MAT_BLK_SZ more products:
         // reduce every accumulator mod p and restart the count
         if (red_count-MAT_BLK_SZ < 0) {
            red_count = red_trigger;
            for (i = 0; i < n*MAT_BLK_SZ; i++)
               xbp[i] = rem((unsigned long)(long)xbp[i], p, red_struct);
         }

         red_count = red_count-MAT_BLK_SZ;

         const double *abp = &A_buf[panel][0];

         muladd_all_by_32(0, n, xbp, abp, brec, k_max-kk);
      }


      // final reduction and write-back of this panel of columns
      for (i = 0; i < n; i++) {
         zz_p *xp = &X[i][0];
         for (j = jj; j < j_max; j++)
            xp[j].LoopHole() =
               rem((unsigned long)(long)xbp[i*MAT_BLK_SZ + (j-jj)], p, red_struct);
      }
   }

   NTL_GEXEC_RANGE_END
}
2137
2138 #endif
2139
2140
// Blocked product X = A*B over zz_p with double-word accumulation.
//
// A is first copied, panel by panel (MAT_BLK_SZ columns per panel, zero
// padded), into buffers of longs. The columns of X are then handled in panels
// of MAT_BLK_SZ columns, distributed over the NTL_GEXEC_RANGE range
// [first, last). For each output panel, the matching block of B is copied
// transposed into brec and muladd_all_by_32 (modular version) updates the
// accumulator panel xbp, which therefore always holds reduced residues.
static
void blk_mul_LL(const mat_window_zz_p& X,
                const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();

   // one buffer of n*MAT_BLK_SZ longs per panel of columns of A;
   // abufp caches the raw pointer to each buffer
   Vec< Vec<long> > A_buf;
   Vec<long *> abufp;
   long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
   A_buf.SetLength(npanels);
   abufp.SetLength(npanels);

   for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
      long k_max = min(kk+MAT_BLK_SZ, l);

      A_buf[panel].SetLength(n * MAT_BLK_SZ);
      long *abp = A_buf[panel].elts();
      abufp[panel] = abp;

      for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
         const zz_p *ap1 = &A[i][0];
         for (long k = kk; k < k_max; k++) {
            abp[k-kk] = rep(ap1[k]);
         }
         // zero padding for a final, partial panel
         for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
            abp[k-kk] = 0;
         }
      }
   }

   long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, nxpanels, first, last)
   NTL_IMPORT(n)
   NTL_IMPORT(l)
   NTL_IMPORT(m)
   NTL_IMPORT(p)
   NTL_IMPORT(ll_red_struct)

   // scratch block of B and accumulator panel, allocated once per range
   UniqueArray<long> B_rec;
   B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   long *brec = B_rec.get();

   UniqueArray<long> X_buf;
   X_buf.SetLength(n*MAT_BLK_SZ);
   long *xbp = X_buf.get();

   long jj, kk;
   long i, j, k;
   long panel;
   long xpanel;

   for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
        xpanel++, jj += MAT_BLK_SZ) {

      long j_max = min(jj+MAT_BLK_SZ, m);

      for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;

      for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
         long k_max = min(kk+MAT_BLK_SZ, l);

         // fill brec, transposed
         // (entries with k-kk >= k_max-kk are left untouched; the kernel is
         // told to use only k_max-kk terms)

         for (k = kk; k < k_max; k++) {
            const zz_p *bp = &B[k][0];
            for (j = jj; j < j_max; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = rep(bp[j]);
            for (j = j_max; j < jj+MAT_BLK_SZ; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = 0;
         }

         const long *abp = abufp[panel];
         muladd_all_by_32(0, n, xbp, abp, brec, k_max-kk, p, ll_red_struct);
      }


      // write this panel of columns back into X
      for (i = 0; i < n; i++) {
         zz_p *xp = &X[i][0];
         for (j = jj; j < j_max; j++)
            xp[j].LoopHole() = xbp[i*MAT_BLK_SZ + (j-jj)];
      }
   }

   NTL_GEXEC_RANGE_END
}
2234
2235
// Blocked product X = A*B over zz_p with single-word (unsigned long)
// accumulation.
//
// A is first copied, panel by panel (MAT_BLK_SZ columns per panel, zero
// padded), into buffers of uhlong. The columns of X are then handled in
// panels of MAT_BLK_SZ columns, distributed over the NTL_GEXEC_RANGE range
// [first, last). For each output panel, the matching block of B is copied
// transposed into brec and muladd_all_by_32 (unsigned version) accumulates
// unreduced sums into xbp. Accumulators are reduced mod p whenever the
// remaining headroom (tracked by red_count) is too small for another panel,
// and once more when the results are stored into X.
static
void blk_mul_L(const mat_window_zz_p& X,
               const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_reduce_struct red_struct = zz_p::red_struct();

   // one buffer of n*MAT_BLK_SZ entries per panel of columns of A;
   // abufp caches the raw pointer to each buffer
   Vec< Vec<uhlong> > A_buf;
   Vec<uhlong*> abufp;
   long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
   A_buf.SetLength(npanels);
   abufp.SetLength(npanels);

   for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
      long k_max = min(kk+MAT_BLK_SZ, l);

      A_buf[panel].SetLength(n * MAT_BLK_SZ);
      uhlong *abp = A_buf[panel].elts();
      abufp[panel] = abp;

      for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
         const zz_p *ap1 = &A[i][0];
         for (long k = kk; k < k_max; k++) {
            abp[k-kk] = rep(ap1[k]);
         }
         // zero padding for a final, partial panel
         for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
            abp[k-kk] = 0;
         }
      }
   }

   long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, nxpanels, first, last)
   NTL_IMPORT(n)
   NTL_IMPORT(l)
   NTL_IMPORT(m)
   NTL_IMPORT(p)
   NTL_IMPORT(red_struct)

   // scratch block of B and accumulator panel, allocated once per range
   UniqueArray<uhlong> B_rec;
   B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   uhlong *brec = B_rec.get();

   UniqueArray<unsigned long> X_buf;
   X_buf.SetLength(n*MAT_BLK_SZ);
   unsigned long *xbp = X_buf.get();

   long jj, kk;
   long i, j, k;
   long panel;
   long xpanel;

   for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
        xpanel++, jj += MAT_BLK_SZ) {

      long j_max = min(jj+MAT_BLK_SZ, m);

      for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;

      // number of products of size at most (p-1)^2 that can be added to a
      // value of at most p-1 without exceeding the largest unsigned long
      unsigned long ured_trigger =
         (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
      // NOTE: corner case at p == 2: need unsigned long to prevent overflow

      // clamp so that the count fits in a (signed) long
      long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

      long red_count = red_trigger;

      for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
         long k_max = min(kk+MAT_BLK_SZ, l);

         // fill brec, transposed
         // (entries with k-kk >= k_max-kk are left untouched; the kernel is
         // told to use only k_max-kk terms)

         for (k = kk; k < k_max; k++) {
            const zz_p *bp = &B[k][0];
            for (j = jj; j < j_max; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = rep(bp[j]);
            for (j = j_max; j < jj+MAT_BLK_SZ; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = 0;
         }

         // not enough headroom left for MAT_BLK_SZ more products:
         // reduce every accumulator mod p and restart the count
         if (red_count-MAT_BLK_SZ < 0) {
            red_count = red_trigger;
            for (i = 0; i < n*MAT_BLK_SZ; i++)
               xbp[i] = rem(xbp[i], p, red_struct);
         }

         red_count = red_count-MAT_BLK_SZ;

         const uhlong *abp = abufp[panel];

         muladd_all_by_32(0, n, xbp, abp, brec, k_max-kk);
      }


      // final reduction and write-back of this panel of columns
      for (i = 0; i < n; i++) {
         zz_p *xp = &X[i][0];
         for (j = jj; j < j_max; j++)
            xp[j].LoopHole() =
               rem(xbp[i*MAT_BLK_SZ + (j-jj)], p, red_struct);
      }
   }

   NTL_GEXEC_RANGE_END
}
2347
2348
2349 #endif
2350
2351
2352
2353
// Base-case product X = A*B over zz_p: picks one of the multiplication
// routines according to the matrix dimensions and the modulus.
//
//   - any dimension zero:                 X is cleared;
//   - NTL_HAVE_LL_TYPE not defined:       basic_mul;
//   - l < 32:                             basic_mul;
//   - fewer than 4 blocks of MAT_BLK_SZ in some dimension:
//         alt_mul_L if l*(p-1)^2 fits in an unsigned long, else alt_mul_LL;
//   - otherwise a blocked routine, with V = 4*MAT_BLK_SZ:
//         blk_mul_DD (AVX builds only) if V*(p-1)^2 + (p-1) <= MAX_DBL_INT,
//         blk_mul_L  if V*(p-1)^2 + (p-1) fits in an unsigned long,
//         blk_mul_LL otherwise.
// The bounds are tested by division so that the tests themselves cannot
// overflow.
static
void mul_base (const mat_window_zz_p& X,
               const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   if (n == 0 || l == 0 || m == 0) {
      clear(X);
      return;
   }


#ifndef NTL_HAVE_LL_TYPE

   basic_mul(X, A, B);

#else

   if (l < 32) {
      //cerr << "basic_mul\n";
      basic_mul(X, A, B);
      return;
   }

   long p = zz_p::modulus();

   if (n/MAT_BLK_SZ < 4 || l/MAT_BLK_SZ < 4 || m/MAT_BLK_SZ < 4) {
      // l products of size at most (p-1)^2 fit in one word?
      if (cast_unsigned(l) <= (~(0UL))/cast_unsigned(p-1) &&
          cast_unsigned(l)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) {
         //cerr << "alt_mul_L\n";
         alt_mul_L(X, A, B);
      }
      else {
         //cerr << "alt_mul_LL\n";
         alt_mul_LL(X, A, B);
      }

      return;
   }

   {
      // the blocked routines allocate buffers of dimension*MAT_BLK_SZ entries
      if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("number too big");
      if (NTL_OVERFLOW(l, MAT_BLK_SZ, 0)) ResourceError("number too big");
      if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("number too big");

      long V = MAT_BLK_SZ*4;

#ifdef NTL_HAVE_AVX
      if (p-1 <= MAX_DBL_INT &&
          V <= (MAX_DBL_INT-(p-1))/(p-1) &&
          V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {

         // cerr << "block_mul_DD\n";
         blk_mul_DD(X, A, B);
      }
      else
#endif
      if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
          cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {

         //cerr << "blk_mul_L\n";
         blk_mul_L(X, A, B);

      }
      else {

         //cerr << "blk_mul_LL\n";
         blk_mul_LL(X, A, B);
      }
   }

#endif


}
2431
2432 // The following implementation of Strassen is derived directly
2433 // from the implementation in FLINT v2.5.2 (see http://www.flintlib.org),
2434 // although a number of details have changed.
2435 // I include the original copyright notice from the file nmod_mat/mul_strassen.c
2436 // in the FLINT distribution.
2437
2438 /*=============================================================================
2439
2440 This file is part of FLINT.
2441
2442 FLINT is free software; you can redistribute it and/or modify
2443 it under the terms of the GNU General Public License as published by
2444 the Free Software Foundation; either version 2 of the License, or
2445 (at your option) any later version.
2446
2447 FLINT is distributed in the hope that it will be useful,
2448 but WITHOUT ANY WARRANTY; without even the implied warranty of
2449 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2450 GNU General Public License for more details.
2451
2452 You should have received a copy of the GNU General Public License
2453 along with FLINT; if not, write to the Free Software
2454 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
2455
2456 =============================================================================*/
2457 /******************************************************************************
2458
2459 Copyright (C) 2008, Martin Albrecht
2460 Copyright (C) 2008, 2009 William Hart.
2461 Copyright (C) 2010, Fredrik Johansson
2462
2463 ******************************************************************************/
2464
2465
2466
2467
2468 void mul_strassen(const mat_window_zz_p& C,
2469 const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
2470 {
2471 long a, b, c;
2472 long anr, anc, bnr, bnc;
2473
2474
2475 a = A.NumRows();
2476 b = A.NumCols();
2477 c = B.NumCols();
2478
2479
2480 bool use_DD = false;
2481 // this code determines if mul_base triggers blk_mul_DD,
2482 // in which case a higher crossover is used
2483
2484 #if (defined(NTL_HAVE_LL_TYPE) && defined(NTL_HAVE_AVX))
2485 {
2486 long V = MAT_BLK_SZ*4;
2487 long p = zz_p::modulus();
2488
2489 if (p-1 <= MAX_DBL_INT &&
2490 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
2491 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1))
2492 {
2493 use_DD = true;
2494 }
2495 }
2496 #endif
2497
2498 long nt = AvailableThreads();
2499
2500 long xover;
2501 // now we set the crossover -- it is kind of a heauristic
2502 // mess based on nt and use_DD...I've run some tests to
2503 // make sure these settings are reasonable, but a more
2504 // rational approach would be preferable
2505
2506 if (nt > 1) {
2507 if (use_DD || nt > 8192/(2*MAT_BLK_SZ))
2508 xover = 8192;
2509 else
2510 xover = max(800, nt*2*MAT_BLK_SZ);
2511 }
2512 else {
2513 if (use_DD)
2514 xover = 800;
2515 else
2516 xover = 448;
2517 }
2518
2519 if (a <= xover || b <= xover || c <= xover)
2520 {
2521 mul_base(C, A, B);
2522 return;
2523 }
2524
2525 anr = a / 2;
2526 anc = b / 2;
2527 bnr = anc;
2528 bnc = c / 2;
2529
2530 const_mat_window_zz_p A11(A, 0, 0, anr, anc);
2531 const_mat_window_zz_p A12(A, 0, anc, anr, 2*anc);
2532 const_mat_window_zz_p A21(A, anr, 0, 2*anr, anc);
2533 const_mat_window_zz_p A22(A, anr, anc, 2*anr, 2*anc);
2534
2535 const_mat_window_zz_p B11(B, 0, 0, bnr, bnc);
2536 const_mat_window_zz_p B12(B, 0, bnc, bnr, 2*bnc);
2537 const_mat_window_zz_p B21(B, bnr, 0, 2*bnr, bnc);
2538 const_mat_window_zz_p B22(B, bnr, bnc, 2*bnr, 2*bnc);
2539
2540 mat_window_zz_p C11(C, 0, 0, anr, bnc);
2541 mat_window_zz_p C12(C, 0, bnc, anr, 2*bnc);
2542 mat_window_zz_p C21(C, anr, 0, 2*anr, bnc);
2543 mat_window_zz_p C22(C, anr, bnc, 2*anr, 2*bnc);
2544
2545 mat_zz_p X1_store;
2546 X1_store.SetDims(anr, max(bnc, anc));
2547
2548 mat_window_zz_p X1a(X1_store, 0, 0, anr, anc);
2549 mat_window_zz_p X1b(X1_store, 0, 0, anr, bnc);
2550
2551 mat_zz_p X2;
2552 X2.SetDims(anc, bnc);
2553
2554 /*
2555 See Jean-Guillaume Dumas, Clement Pernet, Wei Zhou; "Memory
2556 efficient scheduling of Strassen-Winograd's matrix multiplication
2557 algorithm"; http://arxiv.org/pdf/0707.2347v3 for reference on the
2558 used operation scheduling.
2559 */
2560
2561 sub(X1a, A11, A21);
2562 sub(X2, B22, B12);
2563 mul_strassen(C21, X1a, X2);
2564
2565 add(X1a, A21, A22);
2566 sub(X2, B12, B11);
2567 mul_strassen(C22, X1a, X2);
2568
2569 sub(X1a, X1a, A11);
2570 sub(X2, B22, X2);
2571 mul_strassen(C12, X1a, X2);
2572
2573 sub(X1a, A12, X1a);
2574 mul_strassen(C11, X1a, B22);
2575
2576
2577 mul_strassen(X1b, A11, B11);
2578
2579 add(C12, X1b, C12);
2580 add(C21, C12, C21);
2581 add(C12, C12, C22);
2582 add(C22, C21, C22);
2583 add(C12, C12, C11);
2584 sub(X2, X2, B21);
2585 mul_strassen(C11, A22, X2);
2586
2587 X2.kill();
2588
2589 sub(C21, C21, C11);
2590 mul_strassen(C11, A12, B21);
2591
2592 add(C11, X1b, C11);
2593
2594 X1_store.kill();
2595
2596 if (c > 2*bnc) /* A by last col of B -> last col of C */
2597 {
2598 const_mat_window_zz_p Bc(B, 0, 2*bnc, b, c);
2599 mat_window_zz_p Cc(C, 0, 2*bnc, a, c);
2600
2601 mul_strassen(Cc, A, Bc);
2602 }
2603
2604 if (a > 2*anr) /* last row of A by B -> last row of C */
2605 {
2606 const_mat_window_zz_p Ar(A, 2*anr, 0, a, b);
2607 mat_window_zz_p Cr(C, 2*anr, 0, a, c);
2608 mul_strassen(Cr, Ar, B);
2609 }
2610
2611 if (b > 2*anc) /* last col of A by last row of B -> C */
2612 {
2613 const_mat_window_zz_p Ac(A, 0, 2*anc, 2*anr, b);
2614 const_mat_window_zz_p Br(B, 2*bnr, 0, b, 2*bnc);
2615 mat_window_zz_p Cb(C, 0, 0, 2*anr, 2*bnc);
2616
2617 // Cb += Ac*Br
2618 mat_zz_p tmp;
2619 tmp.SetDims(Cb.NumRows(), Cb.NumCols());
2620 mul_strassen(tmp, Ac, Br);
2621 add(Cb, Cb, tmp);
2622 }
2623 }
2624
2625
2626
2627
2628
2629
2630
2631 static
2632 void mul_aux(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
2633 {
2634 long n = A.NumRows();
2635 long l = A.NumCols();
2636 long m = B.NumCols();
2637
2638 if (l != B.NumRows())
2639 LogicError("matrix mul: dimension mismatch");
2640
2641 X.SetDims(n, m);
2642
2643 if (n == 0 || l == 0 || m == 0) {
2644 clear(X);
2645 return;
2646 }
2647
2648 mul_strassen(X, A, B);
2649 }
2650
2651
2652 void mul(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
2653 {
2654 if (&X == &A || &X == &B) {
2655 mat_zz_p tmp;
2656 mul_aux(tmp, A, B);
2657 X = tmp;
2658 }
2659 else
2660 mul_aux(X, A, B);
2661 }
2662
2663
2664 // ******************************************************************
2665 //
2666 // Matrix inversion code
2667 //
2668 // ******************************************************************
2669
2670 static
2671 long relaxed_InvModStatus(long& x, long a, long n, bool relax)
2672 {
2673 if (relax) {
2674 return InvModStatus(x, a, n);
2675 }
2676 else {
2677 x = InvMod(a, n);
2678 return 0;
2679 }
2680 }
2681
2682 static
2683 void basic_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
2684 {
2685 long n = A.NumRows();
2686
2687 if (A.NumCols() != n)
2688 LogicError("inv: nonsquare matrix");
2689
2690 if (n == 0) {
2691 set(d);
2692 X.SetDims(0, 0);
2693 return;
2694 }
2695
2696
2697 Mat<long> M;
2698 conv(M, A);
2699 // scratch space
2700
2701 Vec<long> P;
2702 P.SetLength(n);
2703 for (long k = 0; k < n; k++) P[k] = k;
2704 // records swap operations
2705
2706 long det;
2707 det = 1;
2708
2709 long p = zz_p::modulus();
2710 mulmod_t pinv = zz_p::ModulusInverse();
2711
2712 bool seq = n < PAR_THRESH_SQ;
2713
2714 bool pivoting = false;
2715
2716 for (long k = 0; k < n; k++) {
2717 long pos = -1;
2718 long pivot_inv;
2719 for (long i = k; i < n; i++) {
2720 // NOTE: by using InvModStatus, this code will work
2721 // for prime-powers as well as primes
2722 long pivot = M[i][k];
2723 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
2724 pos = i;
2725 break;
2726 }
2727 }
2728
2729 if (pos != -1) {
2730 if (k != pos) {
2731 swap(M[pos], M[k]);
2732 det = NegateMod(det, p);
2733 P[k] = pos;
2734 pivoting = true;
2735 }
2736
2737 det = MulMod(det, M[k][k], p);
2738
2739 {
2740 // multiply row k by pivot_inv
2741 long t1 = pivot_inv;
2742 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
2743 long * NTL_RESTRICT y = &M[k][0];
2744 for (long j = 0; j < n; j++)
2745 y[j] = MulModPrecon(y[j], t1, p, t1pinv);
2746
2747 y[k] = pivot_inv;
2748 }
2749
2750
2751
2752 NTL_GEXEC_RANGE(seq, n, first, last)
2753 NTL_IMPORT(p)
2754 NTL_IMPORT(n)
2755 NTL_IMPORT(k)
2756 long * NTL_RESTRICT y = &M[k][0];
2757 for (long i = first; i < last; i++) {
2758 if (i == k) continue; // skip row k
2759
2760 long * NTL_RESTRICT x = &M[i][0];
2761 long t1 = x[k];
2762 t1 = NegateMod(t1, p);
2763 x[k] = 0;
2764 if (t1 == 0) continue;
2765
2766 // add t1 * row k to row i
2767 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
2768
2769 for (long j = 0; j < n; j++) {
2770 long t2 = MulModPrecon(y[j], t1, p, t1pinv);
2771 x[j] = AddMod(x[j], t2, p);
2772 }
2773 }
2774 NTL_GEXEC_RANGE_END
2775 }
2776 else {
2777 clear(d);
2778 return;
2779 }
2780 }
2781
2782 if (pivoting) {
2783 // pivot colums, using reverse swap sequence
2784
2785 for (long i = 0; i < n; i++) {
2786 long * NTL_RESTRICT x = &M[i][0];
2787
2788 for (long k = n-1; k >= 0; k--) {
2789 long pos = P[k];
2790 if (pos != k) _ntl_swap(x[pos], x[k]);
2791 }
2792 }
2793 }
2794
2795 X.SetDims(n, n);
2796 for (long i = 0; i < n; i++)
2797 for (long j = 0; j < n; j++)
2798 X[i][j].LoopHole() = M[i][j];
2799
2800 d.LoopHole() = det;
2801 }
2802
2803
2804
2805 #ifdef NTL_HAVE_LL_TYPE
2806
2807
2808
// alt_inv_L: inversion of a square matrix over zz_p by in-place elimination,
// using unsigned long accumulators with delayed modular reduction.
//
// d     -- receives the product of the pivots, negated once per row swap;
//          it is cleared (and X is left untouched) as soon as some column
//          has no usable pivot.
// X     -- receives the inverse, resized to n x n (0 x 0 when n == 0).
// A     -- input matrix; a LogicError is raised if it is not square.
// relax -- forwarded to relaxed_InvModStatus when testing pivots.
//
// Entries of the scratch matrix M are not kept reduced mod p: row updates
// add unreduced products, and a full reduction pass ("cleanup") is run
// every red_trigger columns.
2809 static
2810 void alt_inv_L(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
2811 {
2812 long n = A.NumRows();
2813
2814 if (A.NumCols() != n)
2815 LogicError("inv: nonsquare matrix");
2816
2817 if (n == 0) {
2818 set(d);
2819 X.SetDims(0, 0);
2820 return;
2821 }
2822
2823
2824 Mat<unsigned long> M;
2825 conv(M, A);
2826 // scratch space
2827
2828 Vec<long> P;
2829 P.SetLength(n);
2830 for (long k = 0; k < n; k++) P[k] = k;
2831 // records swap operations
2832
2833 long det;
2834 det = 1;
2835
2836 long p = zz_p::modulus();
2837 mulmod_t pinv = zz_p::ModulusInverse();
2838 sp_reduce_struct red_struct = zz_p::red_struct();
2839
2840
2841
// seq selects sequential execution of the row-update section for small n
2842 bool seq = n < PAR_THRESH_SQ;
2843
2844 bool pivoting = false;
2845
// ured_trigger: how many products of two values <= p-1 can be added to a
// value <= p-1 without exceeding the largest unsigned long
2846 unsigned long ured_trigger =
2847 (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
2848 // NOTE: corner case at p == 2: need unsigned long to prevent overflow
2849
// clamp to the range of long
2850 long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
2851
// red_count: number of columns that may still be processed before the
// next cleanup
2852 long red_count = red_trigger;
2853
2854
2855 for (long k = 0; k < n; k++) {
2856 bool cleanup = false;
2857
// budget exhausted: reset it and request a full reduction in this step
2858 if (red_count-1 < 0) {
2859 red_count = red_trigger;
2860 cleanup = true;
2861 }
2862
2863 red_count = red_count-1;
2864
2865 long pos = -1;
2866 long pivot;
2867 long pivot_inv;
2868
// search column k, from row k downwards, for the first entry that is
// nonzero mod p and invertible
2869 for (long i = k; i < n; i++) {
2870 // NOTE: by using InvModStatus, this code will work
2871 // for prime-powers as well as primes
2872 pivot = rem(M[i][k], p, red_struct);
2873 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
2874 pos = i;
2875 break;
2876 }
2877 }
2878
2879 if (pos != -1) {
2880 if (k != pos) {
2881 swap(M[pos], M[k]);
2882 det = NegateMod(det, p);
2883 P[k] = pos;
2884 pivoting = true;
2885 }
2886
2887 det = MulMod(det, pivot, p);
2888
2889 {
2890 // multiply row k by pivot_inv
// (each entry is reduced first, so row k ends up fully reduced)
2891 long t1 = pivot_inv;
2892 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
2893 unsigned long * NTL_RESTRICT y = &M[k][0];
2894 for (long j = 0; j < n; j++) {
2895 long t2 = rem(y[j], p, red_struct);
2896 y[j] = MulModPrecon(t2, t1, p, t1pinv);
2897 }
2898
// the pivot slot is overwritten with pivot_inv
2899 y[k] = pivot_inv;
2900 }
2901
2902
// update all rows other than k, over the index range [first, last)
2903 NTL_GEXEC_RANGE(seq, n, first, last)
2904 NTL_IMPORT(p)
2905 NTL_IMPORT(n)
2906 NTL_IMPORT(k)
2907 NTL_IMPORT(red_struct)
2908 unsigned long * NTL_RESTRICT y = &M[k][0];
2909 if (cleanup) {
2910 for (long i = first; i < last; i++) {
2911 if (i == k) continue;
2912 // skip row k: the data won't change, but it
2913 // technically is a race condition in a multi-threaded
2914 // execution, and it would violate the "restrict"
2915 // contract
2916
2917 unsigned long * NTL_RESTRICT x = &M[i][0];
2918 for (long j = 0; j < n; j++) {
2919 x[j] = rem(x[j], p, red_struct);
2920 }
2921 }
2922 }
2923
2924
2925 for (long i = first; i < last; i++) {
2926 if (i == k) continue; // skip row k
2927
2928 unsigned long * NTL_RESTRICT x = &M[i][0];
// t1 = -(entry of row i in column k) mod p; the slot is then zeroed
2929 long t1 = rem(x[k], p, red_struct);
2930 t1 = NegateMod(t1, p);
2931 x[k] = 0;
2932 if (t1 == 0) continue;
2933
2934 // add t1 * row k to row i
// (no reduction here; the loop is unrolled by four with a scalar tail)
// NOTE(review): presumably DO_MUL(a, b) is the plain product a*b, which
// is what the ured_trigger bound assumes -- confirm
2935 unsigned long ut1 = t1;
2936 long j;
2937 for (j = 0; j <= n-4; j+=4) {
2938 unsigned long xj0 = x[j+0] + DO_MUL(y[j+0], ut1);
2939 unsigned long xj1 = x[j+1] + DO_MUL(y[j+1], ut1);
2940 unsigned long xj2 = x[j+2] + DO_MUL(y[j+2], ut1);
2941 unsigned long xj3 = x[j+3] + DO_MUL(y[j+3], ut1);
2942 x[j+0] = xj0;
2943 x[j+1] = xj1;
2944 x[j+2] = xj2;
2945 x[j+3] = xj3;
2946 }
2947 for (; j < n; j++) {
2948 x[j] += DO_MUL(y[j], ut1);
2949 }
2950 }
2951 NTL_GEXEC_RANGE_END
2952 }
2953 else {
// no usable pivot in column k: report d = 0 and leave X untouched
2954 clear(d);
2955 return;
2956 }
2957 }
2958
2959 if (pivoting) {
2960 // pivot columns, using reverse swap sequence
2961
2962 for (long i = 0; i < n; i++) {
2963 unsigned long * NTL_RESTRICT x = &M[i][0];
2964
2965 for (long k = n-1; k >= 0; k--) {
2966 long pos = P[k];
2967 if (pos != k) _ntl_swap(x[pos], x[k]);
2968 }
2969 }
2970 }
2971
// copy the result into X, reducing each entry mod p
2972 X.SetDims(n, n);
2973 for (long i = 0; i < n; i++)
2974 for (long j = 0; j < n; j++)
2975 X[i][j].LoopHole() = rem(M[i][j], p, red_struct);
2976
2977 d.LoopHole() = det;
2978 }
2979
2980
2981
2982
2983
2984 #ifdef NTL_HAVE_AVX
2985
// alt_inv_DD: inversion of a square matrix over zz_p by in-place elimination,
// using double accumulators with delayed modular reduction.
//
// d     -- receives the product of the pivots, negated once per row swap;
//          it is cleared (and X is left untouched) as soon as some column
//          has no usable pivot.
// X     -- receives the inverse, resized to n x n (0 x 0 when n == 0).
// A     -- input matrix; a LogicError is raised if it is not square.
// relax -- forwarded to relaxed_InvModStatus when testing pivots.
//
// Each row of the scratch matrix M is an AlignedArray<double>.  Entries are
// not kept reduced mod p: a full reduction pass ("cleanup") is run every
// red_trigger columns.  Values are converted double -> long -> unsigned long
// before each call to rem.
2986 static
2987 void alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
2988 {
2989 long n = A.NumRows();
2990
2991 if (A.NumCols() != n)
2992 LogicError("inv: nonsquare matrix");
2993
2994 if (n == 0) {
2995 set(d);
2996 X.SetDims(0, 0);
2997 return;
2998 }
2999
// scratch copy of A, one array of n doubles per row
3000 Vec< AlignedArray<double> > M;
3001 M.SetLength(n);
3002 for (long i = 0; i < n; i++) M[i].SetLength(n);
3003
3004 for (long i = 0; i < n; i++) {
3005 for (long j = 0; j < n; j++)
3006 M[i][j] = rep(A[i][j]);
3007 }
3008
3009
3010 Vec<long> P;
3011 P.SetLength(n);
3012 for (long k = 0; k < n; k++) P[k] = k;
3013 // records swap operations
3014
3015 long det;
3016 det = 1;
3017
3018 long p = zz_p::modulus();
3019 mulmod_t pinv = zz_p::ModulusInverse();
3020 sp_reduce_struct red_struct = zz_p::red_struct();
3021
3022
3023
// seq selects sequential execution of the row-update section for small n
3024 bool seq = n < PAR_THRESH_SQ;
3025
3026 bool pivoting = false;
3027
// red_trigger: how many products of two values <= p-1 can be added to a
// value <= p-1 without exceeding MAX_DBL_INT
// NOTE(review): MAX_DBL_INT is presumably the largest integer magnitude a
// double holds exactly -- confirm at its definition
3028 long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
3029 long red_count = red_trigger;
3030
3031 for (long k = 0; k < n; k++) {
3032 bool cleanup = false;
3033
// budget exhausted: reset it and request a full reduction in this step
3034 if (red_count-1 < 0) {
3035 red_count = red_trigger;
3036 cleanup = true;
3037 }
3038
3039 red_count = red_count-1;
3040
3041 long pos = -1;
3042 long pivot;
3043 long pivot_inv;
3044
3045
3046
// search column k, from row k downwards, for the first entry that is
// nonzero mod p and invertible
3047 for (long i = k; i < n; i++) {
3048 // NOTE: by using InvModStatus, this code will work
3049 // for prime-powers as well as primes
3050 pivot = rem((unsigned long)(long)M[i][k], p, red_struct);
3051 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
3052 pos = i;
3053 break;
3054 }
3055 }
3056
3057 if (pos != -1) {
3058 if (k != pos) {
3059 swap(M[pos], M[k]);
3060 det = NegateMod(det, p);
3061 P[k] = pos;
3062 pivoting = true;
3063 }
3064
3065 det = MulMod(det, pivot, p);
3066
3067 {
3068 // multiply row k by pivot_inv
// (each entry is reduced first, so row k ends up fully reduced)
3069 long t1 = pivot_inv;
3070 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
3071 double * NTL_RESTRICT y = &M[k][0];
3072 for (long j = 0; j < n; j++) {
3073 long t2 = rem((unsigned long)(long)y[j], p, red_struct);
3074 y[j] = MulModPrecon(t2, t1, p, t1pinv);
3075 }
3076
// the pivot slot is overwritten with pivot_inv
3077 y[k] = pivot_inv;
3078 }
3079
3080
// update all rows other than k, over the index range [first, last)
3081 NTL_GEXEC_RANGE(seq, n, first, last)
3082 NTL_IMPORT(p)
3083 NTL_IMPORT(n)
3084 NTL_IMPORT(k)
3085 NTL_IMPORT(red_struct)
3086 double * NTL_RESTRICT y = &M[k][0];
3087 if (cleanup) {
3088 for (long i = first; i < last; i++) {
3089 if (i == k) continue;
3090 // skip row k: the data won't change, but it
3091 // technically is a race condition in a multi-threaded
3092 // execution, and it would violate the "restrict"
3093 // contract
3094
3095 double * NTL_RESTRICT x = &M[i][0];
3096 for (long j = 0; j < n; j++) {
3097 x[j] = rem((unsigned long)(long)x[j], p, red_struct);
3098 }
3099 }
3100 }
3101
3102
3103 for (long i = first; i < last; i++) {
3104 if (i == k) continue; // skip row k
3105
3106 double * NTL_RESTRICT x = &M[i][0];
// t1 = -(entry of row i in column k) mod p; the slot is then zeroed
3107 long t1 = rem((unsigned long)(long)x[k], p, red_struct);
3108 t1 = NegateMod(t1, p);
3109 x[k] = 0;
3110 if (t1 == 0) continue;
3111
3112 // add t1 * row k to row i
// NOTE(review): presumably muladd_interval1 performs x[j] += y[j]*ut1
// for j in [0, n) without reducing -- confirm at its definition
3113 double ut1 = t1;
3114 muladd_interval1(x, y, ut1, n);
3115 }
3116 NTL_GEXEC_RANGE_END
3117 }
3118 else {
// no usable pivot in column k: report d = 0 and leave X untouched
3119 clear(d);
3120 return;
3121 }
3122 }
3123
3124
3125 if (pivoting) {
3126 // pivot columns, using reverse swap sequence
3127
3128 for (long i = 0; i < n; i++) {
3129 double * NTL_RESTRICT x = &M[i][0];
3130
3131 for (long k = n-1; k >= 0; k--) {
3132 long pos = P[k];
3133 if (pos != k) _ntl_swap(x[pos], x[k]);
3134 }
3135 }
3136 }
3137
3138
// copy the result into X, reducing each entry mod p
3139 X.SetDims(n, n);
3140 for (long i = 0; i < n; i++)
3141 for (long j = 0; j < n; j++)
3142 X[i][j].LoopHole() = rem((unsigned long)(long)M[i][j], p, red_struct);
3143
3144 d.LoopHole() = det;
3145 }
3146
3147 #endif
3148
3149
3150
3151
3152
3153 #ifdef NTL_HAVE_AVX
3154
// blk_inv_DD: blocked inversion of a square matrix over zz_p, using double
// accumulators with delayed modular reduction.
//
// d     -- receives the product of the pivots, negated once per row swap;
//          it is cleared (and X is left untouched) as soon as some column
//          has no usable pivot.
// X     -- receives the inverse, resized to n x n (0 x 0 when n == 0).
// A     -- input matrix; a LogicError is raised if it is not square.
// relax -- forwarded to relaxed_InvModStatus when testing pivots.
//
// Storage: the matrix is held as column panels.  Panel number jj/MAT_BLK_SZ
// holds columns jj .. jj+MAT_BLK_SZ-1 as n consecutive rows of MAT_BLK_SZ
// doubles; unused columns of the last panel stay zero.
//
// For each panel in turn, the elimination is first carried out inside that
// panel alone, and the accumulated transformation is then applied to every
// other panel (possibly in parallel).
3155 static
3156 void blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
3157 {
3158 long n = A.NumRows();
3159
3160 if (A.NumCols() != n)
3161 LogicError("inv: nonsquare matrix");
3162
3163 if (n == 0) {
3164 set(d);
3165 X.SetDims(0, 0);
3166 return;
3167 }
3168
// n*MAT_BLK_SZ is used below as an array length and index bound
3169 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
3170
// number of column panels, rounding up
3171 long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
3172
3173
// allocate and zero all panels
3174 Vec< AlignedArray<double> > M;
3175 M.SetLength(npanels);
3176 for (long panel = 0; panel < npanels; panel++) {
3177 M[panel].SetLength(n*MAT_BLK_SZ);
3178 double *panelp = &M[panel][0];
3179
3180 for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
3181 }
3182
3183 // copy A into panels
3184 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
3185 long j_max = min(jj+MAT_BLK_SZ, n);
3186 double *panelp = &M[panel][0];
3187
3188 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
3189 const zz_p *ap = A[i].elts() + jj;
3190
3191 for (long j = jj; j < j_max; j++)
3192 panelp[j-jj] = rep(ap[j-jj]);
3193 }
3194 }
3195
3196 Vec<long> P;
3197 P.SetLength(n);
3198 for (long k = 0; k < n; k++) P[k] = k;
3199 // records swap operations
3200
3201
3202 long det;
3203 det = 1;
3204
3205 long p = zz_p::modulus();
3206 mulmod_t pinv = zz_p::ModulusInverse();
3207 sp_reduce_struct red_struct = zz_p::red_struct();
3208
3209
// seq selects sequential execution of the cross-panel update for small
// problems (the product is formed in double)
3210 bool seq = double(n)*double(n)*double(MAT_BLK_SZ) < PAR_THRESH;
3211
3212 bool pivoting = false;
3213
// red_trigger: how many products of two values <= p-1 can be added to a
// value <= p-1 without exceeding MAX_DBL_INT
// NOTE(review): MAX_DBL_INT is presumably the largest integer magnitude a
// double holds exactly -- confirm at its definition
3214 long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
3215 long red_count = red_trigger;
3216
// kk is the first column of the current panel, kpanel its panel index
3217 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
3218 long k_max = min(kk+MAT_BLK_SZ, n);
3219
3220 bool cleanup = false;
3221
// the budget is charged MAT_BLK_SZ per panel; when it would go negative,
// reset it and request a full reduction of every panel in this step
3222 if (red_count-MAT_BLK_SZ < 0) {
3223 red_count = red_trigger;
3224 cleanup = true;
3225 }
3226
3227 red_count = red_count-MAT_BLK_SZ;
3228 double * NTL_RESTRICT kpanelp = &M[kpanel][0];
3229
3230 if (cleanup) {
3231 for (long r = 0; r < n*MAT_BLK_SZ; r++)
3232 kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
3233 }
3234
// elimination restricted to the current panel, one column at a time
3235 for (long k = kk; k < k_max; k++) {
3236
3237 long pos = -1;
3238 long pivot;
3239 long pivot_inv;
3240
// search column k (column k-kk of this panel), from row k downwards, for
// the first entry that is nonzero mod p and invertible
3241 for (long i = k; i < n; i++) {
3242 // NOTE: by using InvModStatus, this code will work
3243 // for prime-powers as well as primes
3244 pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
3245 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
3246 pos = i;
3247 break;
3248 }
3249 }
3250
// no usable pivot: report d = 0 and leave X untouched
3251 if (pos == -1) {
3252 clear(d);
3253 return;
3254 }
3255
3256 double * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
3257 if (k != pos) {
3258 // swap rows pos and k
// (only within this panel; the other panels replay the swap later from P)
3259 double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
3260 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
3261
3262 det = NegateMod(det, p);
3263 P[k] = pos;
3264 pivoting = true;
3265 }
3266
3267 det = MulMod(det, pivot, p);
3268
3269 {
3270 // multiply row k by pivot_inv
3271 long t1 = pivot_inv;
3272 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
3273 for (long j = 0; j < MAT_BLK_SZ; j++) {
3274 long t2 = rem((unsigned long)(long)y[j], p, red_struct);
3275 y[j] = MulModPrecon(t2, t1, p, t1pinv);
3276 }
3277
// the pivot slot is overwritten with pivot_inv
3278 y[k-kk] = pivot_inv;
3279 }
3280
3281 for (long i = 0; i < n; i++) {
3282 if (i == k) continue; // skip row k
3283
3284 double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
// t1 = -(entry of row i in column k) mod p; the slot is then zeroed
3285 long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
3286 t1 = NegateMod(t1, p);
3287 x[k-kk] = 0;
3288 if (t1 == 0) continue;
3289
3290 // add t1 * row k to row i
// NOTE(review): presumably muladd_interval performs x[j] += y[j]*ut1 for
// j in [0, MAT_BLK_SZ) without reducing -- confirm at its definition
3291 double ut1 = t1;
3292 muladd_interval(x, y, ut1, MAT_BLK_SZ);
3293 }
3294 }
3295
3296
3297 // finished processing current kpanel
3298 // next, reduce and apply to all other kpanels
3299
3300 for (long r = 0; r < n*MAT_BLK_SZ; r++)
3301 kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
3302
3303 // special processing: subtract 1 off of diagonal
// NOTE(review): with 1 removed from the diagonal, the update
// "jpanel += kpanel*buf" below appears to replace rows kk..k_max-1 of
// jpanel by the transformed rows instead of adding to them -- confirm
// against muladd_all_by_32
3304
3305 for (long k = kk; k < k_max; k++)
3306 kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
3307
3308
// update every other panel, over the panel index range [first, last)
3309 NTL_GEXEC_RANGE(seq, npanels, first, last)
3310 NTL_IMPORT(p)
3311 NTL_IMPORT(n)
3312 NTL_IMPORT(red_struct)
3313 NTL_IMPORT(kpanel)
3314 NTL_IMPORT(kpanelp)
3315 NTL_IMPORT(kk)
3316 NTL_IMPORT(k_max)
3317
3318
// buffer of MAT_BLK_SZ x MAT_BLK_SZ doubles, allocated once per invocation
// of this section and reused for every panel in [first, last)
3319 AlignedArray<double> buf_store;
3320 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
3321 double *buf = &buf_store[0];
3322
3323 for (long jpanel = first; jpanel < last; jpanel++) {
3324 if (jpanel == kpanel) continue;
3325
3326 double * NTL_RESTRICT jpanelp = &M[jpanel][0];
3327
3328 if (cleanup) {
3329 for (long r = 0; r < n*MAT_BLK_SZ; r++)
3330 jpanelp[r] = rem((unsigned long)(long)jpanelp[r], p, red_struct);
3331 }
3332
3333 // perform swaps
// (replays, in the same order, the row swaps recorded in P for this panel)
3334 for (long k = kk; k < k_max; k++) {
3335 long pos = P[k];
3336 if (pos != k) {
3337 // swap rows pos and k
3338 double * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
3339 double * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
3340 for (long j = 0; j < MAT_BLK_SZ; j++)
3341 _ntl_swap(pos_p[j], k_p[j]);
3342 }
3343 }
3344
3345 // copy block number kpanel (the one on the diagonal) into buf
// (rows kk..k_max-1 of jpanel, reduced mod p, copied in row order)
3346
3347 for (long i = 0; i < (k_max-kk)*MAT_BLK_SZ; i++)
3348 buf[i] = rem((unsigned long)(long)jpanelp[kk*MAT_BLK_SZ+i], p, red_struct);
3349
3350 // jpanel += kpanel*buf
3351
3352 muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk);
3353 }
3354
3355 NTL_GEXEC_RANGE_END
3356
3357 // special processing: add 1 back to the diagonal
3358
3359 for (long k = kk; k < k_max; k++)
3360 kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
3361
3362 }
3363
3364 if (pivoting) {
3365 // pivot columns, using reverse swap sequence
3366
3367 for (long k = n-1; k >= 0; k--) {
3368 long pos = P[k];
3369 if (pos != k) {
3370 // swap columns pos and k
// (x and y point at the first-row entries of the two columns inside their
// panels; successive rows are MAT_BLK_SZ apart)
3371
3372 double * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
3373 double * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
3374 for (long i = 0; i < n; i++) {
3375 _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
3376 }
3377 }
3378 }
3379 }
3380
3381
3382 // copy panels into X
// (each entry is reduced mod p on the way out)
3383 X.SetDims(n, n);
3384 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
3385 long j_max = min(jj+MAT_BLK_SZ, n);
3386 double *panelp = &M[panel][0];
3387
3388 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
3389 zz_p *xp = X[i].elts() + jj;
3390
3391 for (long j = jj; j < j_max; j++)
3392 xp[j-jj].LoopHole() = rem((unsigned long)(long)panelp[j-jj], p, red_struct);
3393 }
3394 }
3395
3396 d.LoopHole() = det;
3397
3398 }
3399
3400 #endif
3401
3402
3403
// blk_inv_L: blocked inversion of a square matrix over zz_p, using unsigned
// long accumulators with delayed modular reduction.
//
// d     -- receives the product of the pivots, negated once per row swap;
//          it is cleared (and X is left untouched) as soon as some column
//          has no usable pivot.
// X     -- receives the inverse, resized to n x n (0 x 0 when n == 0).
// A     -- input matrix; a LogicError is raised if it is not square.
// relax -- forwarded to relaxed_InvModStatus when testing pivots.
//
// Storage: the matrix is held as column panels.  Panel number jj/MAT_BLK_SZ
// holds columns jj .. jj+MAT_BLK_SZ-1 as n consecutive rows of MAT_BLK_SZ
// unsigned longs; unused columns of the last panel stay zero.
//
// For each panel in turn, the elimination is first carried out inside that
// panel alone, and the accumulated transformation is then applied to every
// other panel (possibly in parallel).
3404 static
3405 void blk_inv_L(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
3406 {
3407 long n = A.NumRows();
3408
3409 if (A.NumCols() != n)
3410 LogicError("inv: nonsquare matrix");
3411
3412 if (n == 0) {
3413 set(d);
3414 X.SetDims(0, 0);
3415 return;
3416 }
3417
// n*MAT_BLK_SZ is used below as an array length and index bound
3418 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
3419
// number of column panels, rounding up
3420 long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
3421
// allocate and zero all panels
3422 Vec< UniqueArray<unsigned long> > M;
3423 M.SetLength(npanels);
3424 for (long panel = 0; panel < npanels; panel++) {
3425 M[panel].SetLength(n*MAT_BLK_SZ);
3426 unsigned long *panelp = &M[panel][0];
3427
3428 for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
3429 }
3430
3431 // copy A into panels
3432 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
3433 long j_max = min(jj+MAT_BLK_SZ, n);
3434 unsigned long *panelp = &M[panel][0];
3435
3436 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
3437 const zz_p *ap = A[i].elts() + jj;
3438
3439 for (long j = jj; j < j_max; j++)
3440 panelp[j-jj] = rep(ap[j-jj]);
3441 }
3442 }
3443
3444 Vec<long> P;
3445 P.SetLength(n);
3446 for (long k = 0; k < n; k++) P[k] = k;
3447 // records swap operations
3448
3449
3450 long det;
3451 det = 1;
3452
3453 long p = zz_p::modulus();
3454 mulmod_t pinv = zz_p::ModulusInverse();
3455 sp_reduce_struct red_struct = zz_p::red_struct();
3456
3457
// seq selects sequential execution of the cross-panel update for small
// problems (the product is formed in double)
3458 bool seq = double(n)*double(n)*double(MAT_BLK_SZ) < PAR_THRESH;
3459
3460 bool pivoting = false;
3461
// ured_trigger: how many products of two values <= p-1 can be added to a
// value <= p-1 without exceeding the largest unsigned long
3462 unsigned long ured_trigger =
3463 (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
3464 // NOTE: corner case at p == 2: need unsigned long to prevent overflow
3465
// clamp to the range of long
3466 long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
3467
3468 long red_count = red_trigger;
3469
// kk is the first column of the current panel, kpanel its panel index
3470 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
3471 long k_max = min(kk+MAT_BLK_SZ, n);
3472
3473 bool cleanup = false;
3474
// the budget is charged MAT_BLK_SZ per panel; when it would go negative,
// reset it and request a full reduction of every panel in this step
3475 if (red_count-MAT_BLK_SZ < 0) {
3476 red_count = red_trigger;
3477 cleanup = true;
3478 }
3479
3480 red_count = red_count-MAT_BLK_SZ;
3481 unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
3482
3483 if (cleanup) {
3484 for (long r = 0; r < n*MAT_BLK_SZ; r++)
3485 kpanelp[r] = rem(kpanelp[r], p, red_struct);
3486 }
3487
// elimination restricted to the current panel, one column at a time
3488 for (long k = kk; k < k_max; k++) {
3489
3490 long pos = -1;
3491 long pivot;
3492 long pivot_inv;
3493
// search column k (column k-kk of this panel), from row k downwards, for
// the first entry that is nonzero mod p and invertible
3494 for (long i = k; i < n; i++) {
3495 // NOTE: by using InvModStatus, this code will work
3496 // for prime-powers as well as primes
3497 pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
3498 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
3499 pos = i;
3500 break;
3501 }
3502 }
3503
// no usable pivot: report d = 0 and leave X untouched
3504 if (pos == -1) {
3505 clear(d);
3506 return;
3507 }
3508
3509 unsigned long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
3510 if (k != pos) {
3511 // swap rows pos and k
// (only within this panel; the other panels replay the swap later from P)
3512 unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
3513 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
3514
3515 det = NegateMod(det, p);
3516 P[k] = pos;
3517 pivoting = true;
3518 }
3519
3520 det = MulMod(det, pivot, p);
3521
3522 {
3523 // multiply row k by pivot_inv
3524 long t1 = pivot_inv;
3525 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
3526 for (long j = 0; j < MAT_BLK_SZ; j++) {
3527 long t2 = rem(y[j], p, red_struct);
3528 y[j] = MulModPrecon(t2, t1, p, t1pinv);
3529 }
3530
// the pivot slot is overwritten with pivot_inv
3531 y[k-kk] = pivot_inv;
3532 }
3533
3534 for (long i = 0; i < n; i++) {
3535 if (i == k) continue; // skip row k
3536
3537 unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
// t1 = -(entry of row i in column k) mod p; the slot is then zeroed
3538 long t1 = rem(x[k-kk], p, red_struct);
3539 t1 = NegateMod(t1, p);
3540 x[k-kk] = 0;
3541 if (t1 == 0) continue;
3542
3543 // add t1 * row k to row i
// NOTE(review): presumably muladd_interval performs x[j] += y[j]*ut1 for
// j in [0, MAT_BLK_SZ) without reducing -- confirm at its definition
3544 unsigned long ut1 = t1;
3545 muladd_interval(x, y, ut1, MAT_BLK_SZ);
3546 }
3547 }
3548
3549
3550 // finished processing current kpanel
3551 // next, reduce and apply to all other kpanels
3552
3553 for (long r = 0; r < n*MAT_BLK_SZ; r++)
3554 kpanelp[r] = rem(kpanelp[r], p, red_struct);
3555
3556 // special processing: subtract 1 off of diagonal
// NOTE(review): with 1 removed from the diagonal, the update
// "jpanel += kpanel*buf" below appears to replace rows kk..k_max-1 of
// jpanel by the transformed rows instead of adding to them -- confirm
// against muladd_all_by_32
3557
3558 for (long k = kk; k < k_max; k++)
3559 kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
3560
3561
// update every other panel, over the panel index range [first, last)
3562 NTL_GEXEC_RANGE(seq, npanels, first, last)
3563 NTL_IMPORT(p)
3564 NTL_IMPORT(n)
3565 NTL_IMPORT(red_struct)
3566 NTL_IMPORT(kpanel)
3567 NTL_IMPORT(kpanelp)
3568 NTL_IMPORT(kk)
3569 NTL_IMPORT(k_max)
3570
3571
// buffer of MAT_BLK_SZ x MAT_BLK_SZ entries, allocated once per invocation
// of this section and reused for every panel in [first, last)
3572 UniqueArray<unsigned long> buf_store;
3573 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
3574 unsigned long *buf = &buf_store[0];
3575
3576 for (long jpanel = first; jpanel < last; jpanel++) {
3577 if (jpanel == kpanel) continue;
3578
3579 unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
3580
3581 if (cleanup) {
3582 for (long r = 0; r < n*MAT_BLK_SZ; r++)
3583 jpanelp[r] = rem(jpanelp[r], p, red_struct);
3584 }
3585
3586 // perform swaps
// (replays, in the same order, the row swaps recorded in P for this panel)
3587 for (long k = kk; k < k_max; k++) {
3588 long pos = P[k];
3589 if (pos != k) {
3590 // swap rows pos and k
3591 unsigned long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
3592 unsigned long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
3593 for (long j = 0; j < MAT_BLK_SZ; j++)
3594 _ntl_swap(pos_p[j], k_p[j]);
3595 }
3596 }
3597
3598 // copy block number kpanel (the one on the diagonal) into buf
3599 // here, we transpose it
// (entry (k-kk, j) of rows kk..k_max-1 of jpanel, reduced mod p, lands at
// buf[j*MAT_BLK_SZ + (k-kk)])
3600
3601 for (long k = kk; k < k_max; k++)
3602 for (long j = 0; j < MAT_BLK_SZ; j++)
3603 buf[j*MAT_BLK_SZ + (k-kk)] =
3604 rem(jpanelp[k*MAT_BLK_SZ+j], p, red_struct);
3605
3606 // jpanel += kpanel*buf
3607
3608 muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk);
3609 }
3610
3611 NTL_GEXEC_RANGE_END
3612
3613 // special processing: add 1 back to the diagonal
3614
3615 for (long k = kk; k < k_max; k++)
3616 kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
3617
3618 }
3619
3620 if (pivoting) {
3621 // pivot columns, using reverse swap sequence
3622
3623 for (long k = n-1; k >= 0; k--) {
3624 long pos = P[k];
3625 if (pos != k) {
3626 // swap columns pos and k
// (x and y point at the first-row entries of the two columns inside their
// panels; successive rows are MAT_BLK_SZ apart)
3627
3628 unsigned long * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
3629 unsigned long * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
3630 for (long i = 0; i < n; i++) {
3631 _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
3632 }
3633 }
3634 }
3635 }
3636
3637 // copy panels into X
// (each entry is reduced mod p on the way out)
3638 X.SetDims(n, n);
3639 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
3640 long j_max = min(jj+MAT_BLK_SZ, n);
3641 unsigned long *panelp = &M[panel][0];
3642
3643 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
3644 zz_p *xp = X[i].elts() + jj;
3645
3646 for (long j = jj; j < j_max; j++)
3647 xp[j-jj].LoopHole() = rem(panelp[j-jj], p, red_struct);
3648 }
3649 }
3650
3651 d.LoopHole() = det;
3652
3653 }
3654
3655
3656
3657
3658
3659
3660
3661
3662 static
3663 void blk_inv_LL(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
3664 {
3665 long n = A.NumRows();
3666
3667 if (A.NumCols() != n)
3668 LogicError("inv: nonsquare matrix");
3669
3670 if (n == 0) {
3671 set(d);
3672 X.SetDims(0, 0);
3673 return;
3674 }
3675
3676 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too big");
3677
3678 long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
3679
3680 Vec< UniqueArray<long> > M;
3681 M.SetLength(npanels);
3682 for (long panel = 0; panel < npanels; panel++) {
3683 M[panel].SetLength(n*MAT_BLK_SZ);
3684 long *panelp = &M[panel][0];
3685
3686 for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
3687 }
3688
3689
3690 // copy A into panels
3691 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
3692 long j_max = min(jj+MAT_BLK_SZ, n);
3693 long *panelp = &M[panel][0];
3694
3695 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
3696 const zz_p *ap = A[i].elts() + jj;
3697
3698 for (long j = jj; j < j_max; j++)
3699 panelp[j-jj] = rep(ap[j-jj]);
3700 }
3701 }
3702
3703 Vec<long> P;
3704 P.SetLength(n);
3705 for (long k = 0; k < n; k++) P[k] = k;
3706 // records swap operations
3707
3708
3709 long det;
3710 det = 1;
3711
3712 long p = zz_p::modulus();
3713 mulmod_t pinv = zz_p::ModulusInverse();
3714 sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
3715
3716
3717 bool seq = double(n)*double(n)*double(MAT_BLK_SZ) < PAR_THRESH;
3718
3719 bool pivoting = false;
3720
3721 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
3722 long k_max = min(kk+MAT_BLK_SZ, n);
3723
3724 long * NTL_RESTRICT kpanelp = &M[kpanel][0];
3725
3726
3727 for (long k = kk; k < k_max; k++) {
3728
3729 long pos = -1;
3730 long pivot;
3731 long pivot_inv;
3732
3733 for (long i = k; i < n; i++) {
3734 // NOTE: by using InvModStatus, this code will work
3735 // for prime-powers as well as primes
3736 pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
3737 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
3738 pos = i;
3739 break;
3740 }
3741 }
3742
3743 if (pos == -1) {
3744 clear(d);
3745 return;
3746 }
3747
3748 long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
3749 if (k != pos) {
3750 // swap rows pos and k
3751 long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
3752 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
3753
3754 det = NegateMod(det, p);
3755 P[k] = pos;
3756 pivoting = true;
3757 }
3758
3759 det = MulMod(det, pivot, p);
3760
3761 {
3762 // multiply row k by pivot_inv
3763 long t1 = pivot_inv;
3764 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
3765 for (long j = 0; j < MAT_BLK_SZ; j++) {
3766 y[j] = MulModPrecon(y[j], t1, p, t1pinv);
3767 }
3768
3769 y[k-kk] = pivot_inv;
3770 }
3771
3772 for (long i = 0; i < n; i++) {
3773 if (i == k) continue; // skip row k
3774
3775 long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
3776 long t1 = x[k-kk];
3777 t1 = NegateMod(t1, p);
3778 x[k-kk] = 0;
3779 if (t1 == 0) continue;
3780
3781 // add t1 * row k to row i
3782 long ut1 = t1;
3783 muladd_interval(x, y, ut1, MAT_BLK_SZ, p, pinv);
3784 }
3785 }
3786
3787
3788 // finished processing current kpanel
3789 // next, reduce and apply to all other kpanels
3790
3791 // special processing: subtract 1 off of diagonal
3792
3793 for (long k = kk; k < k_max; k++)
3794 kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod(kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
3795
3796
3797 NTL_GEXEC_RANGE(seq, npanels, first, last)
3798 NTL_IMPORT(p)
3799 NTL_IMPORT(n)
3800 NTL_IMPORT(ll_red_struct)
3801 NTL_IMPORT(kpanel)
3802 NTL_IMPORT(kpanelp)
3803 NTL_IMPORT(kk)
3804 NTL_IMPORT(k_max)
3805
3806
3807 UniqueArray<long> buf_store;
3808 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
3809 long *buf = &buf_store[0];
3810
3811 for (long jpanel = first; jpanel < last; jpanel++) {
3812 if (jpanel == kpanel) continue;
3813
3814 long * NTL_RESTRICT jpanelp = &M[jpanel][0];
3815
3816 // perform swaps
3817 for (long k = kk; k < k_max; k++) {
3818 long pos = P[k];
3819 if (pos != k) {
3820 // swap rows pos and k
3821 long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
3822 long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
3823 for (long j = 0; j < MAT_BLK_SZ; j++)
3824 _ntl_swap(pos_p[j], k_p[j]);
3825 }
3826 }
3827
3828 // copy block number kpanel (the one on the diagonal) into buf
3829 // here, we transpose it
3830
3831 for (long k = kk; k < k_max; k++)
3832 for (long j = 0; j < MAT_BLK_SZ; j++)
3833 buf[j*MAT_BLK_SZ + (k-kk)] =
3834 jpanelp[k*MAT_BLK_SZ+j];
3835
3836
3837 // jpanel += kpanel*buf
3838
3839 muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk, p, ll_red_struct);
3840 }
3841
3842 NTL_GEXEC_RANGE_END
3843
3844 // special processing: add 1 back to the diagonal
3845
3846 for (long k = kk; k < k_max; k++)
3847 kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod(kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
3848
3849 }
3850
3851 if (pivoting) {
3852 // pivot columns, using reverse swap sequence
3853
3854 for (long k = n-1; k >= 0; k--) {
3855 long pos = P[k];
3856 if (pos != k) {
3857 // swap columns pos and k
3858
3859 long * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
3860 long * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
3861 for (long i = 0; i < n; i++) {
3862 _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
3863 }
3864 }
3865 }
3866 }
3867
3868 // copy panels into X
3869 X.SetDims(n, n);
3870 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
3871 long j_max = min(jj+MAT_BLK_SZ, n);
3872 long *panelp = &M[panel][0];
3873
3874 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
3875 zz_p *xp = X[i].elts() + jj;
3876
3877 for (long j = jj; j < j_max; j++)
3878 xp[j-jj].LoopHole() = panelp[j-jj];
3879 }
3880 }
3881
3882 d.LoopHole() = det;
3883
3884 }
3885
3886
3887
3888 #endif
3889
3890
3891
3892 void relaxed_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
3893 {
3894 long n = A.NumRows();
3895
3896 if (A.NumCols() != n)
3897 LogicError("inv: nonsquare matrix");
3898
3899 #ifndef NTL_HAVE_LL_TYPE
3900
3901 basic_inv(d, X, A, relax);
3902
3903 #else
3904
3905 long p = zz_p::modulus();
3906
3907 if (n < 16) {
3908 //cerr << "basic_inv\n";
3909 basic_inv(d, X, A, relax);
3910 }
3911 else if (n/MAT_BLK_SZ < 4) {
3912 long V = 64;
3913
3914 #ifdef NTL_HAVE_AVX
3915 if (p-1 <= MAX_DBL_INT &&
3916 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
3917 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
3918
3919 //cerr << "alt_inv_DD\n";
3920 alt_inv_DD(d, X, A, relax);
3921 }
3922 else
3923 #endif
3924 if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
3925 cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
3926
3927 //cerr << "alt_inv_L\n";
3928 alt_inv_L(d, X, A, relax);
3929
3930 }
3931 else {
3932
3933 //cerr << "basic_inv\n";
3934 basic_inv(d, X, A, relax);
3935 }
3936 }
3937 else {
3938 long V = 4*MAT_BLK_SZ;
3939
3940 #ifdef NTL_HAVE_AVX
3941 if (p-1 <= MAX_DBL_INT &&
3942 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
3943 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
3944
3945 //cerr << "blk_inv_DD\n";
3946 blk_inv_DD(d, X, A, relax);
3947 }
3948 else
3949 #endif
3950 if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
3951 cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
3952
3953 //cerr << "blk_inv_L\n";
3954 blk_inv_L(d, X, A, relax);
3955
3956 }
3957 else {
3958
3959 //cerr << "blk_inv_LL\n";
3960 blk_inv_LL(d, X, A, relax);
3961 }
3962
3963 }
3964
3965 #endif
3966
3967
3968
3969 }
3970
3971
3972
3973 // ******************************************************************
3974 //
3975 // Triangularizing square matrices, with applications
3976 // to solving linear systems and computing determinants.
3977 // Should be about 3x faster than the matrix inverse
3978 // algorithms.
3979 //
3980 // ******************************************************************
3981
3982
3983 static
3984 void basic_tri(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
3985 vec_zz_p *xp, bool trans, bool relax)
3986 {
3987 long n = A.NumRows();
3988
3989 // adjust
3990 if (A.NumCols() != n)
3991 LogicError("tri: nonsquare matrix");
3992
3993 // adjust
3994 if (bp && bp->length() != n)
3995 LogicError("tri: dimension mismatch");
3996
3997 // adjust
3998 if (bp && !xp)
3999 LogicError("tri: bad args");
4000
4001 if (n == 0) {
4002 set(d);
4003 // adjust
4004 if (xp) xp->SetLength(0);
4005 return;
4006 }
4007
4008 // adjust (several lines)
4009 // scratch space
4010 Mat<long> M;
4011 if (!trans) {
4012 conv(M, A);
4013 }
4014 else {
4015 M.SetDims(n, n);
4016 for (long i = 0; i < n; i++)
4017 for (long j = 0; j < n; j++)
4018 M[i][j] = rep(A[j][i]);
4019 }
4020
4021 Vec<long> bv;
4022 if (bp) conv(bv, *bp);
4023 // end adjust
4024
4025
4026 Vec<long> P;
4027 P.SetLength(n);
4028 for (long k = 0; k < n; k++) P[k] = k;
4029 // records swap operations
4030
4031 long det;
4032 det = 1;
4033
4034 long p = zz_p::modulus();
4035 mulmod_t pinv = zz_p::ModulusInverse();
4036
4037
4038 bool pivoting = false;
4039
4040 for (long k = 0; k < n; k++) {
4041 long pos = -1;
4042 long pivot_inv;
4043 for (long i = k; i < n; i++) {
4044 // NOTE: by using InvModStatus, this code will work
4045 // for prime-powers as well as primes
4046 long pivot = M[i][k];
4047 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
4048 pos = i;
4049 break;
4050 }
4051 }
4052
4053 if (pos != -1) {
4054 if (k != pos) {
4055 swap(M[pos], M[k]);
4056 det = NegateMod(det, p);
4057 P[k] = pos;
4058 pivoting = true;
4059
4060 // adjust
4061 if (bp) _ntl_swap(bv[pos], bv[k]);
4062 }
4063
4064 det = MulMod(det, M[k][k], p);
4065
4066 {
4067 // multiply row k by pivot_inv
4068 long t1 = pivot_inv;
4069 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
4070 long * NTL_RESTRICT y = &M[k][0];
4071 // adjust
4072 for (long j = k+1; j < n; j++)
4073 y[j] = MulModPrecon(y[j], t1, p, t1pinv);
4074
4075 // adjust // y[k] = pivot_inv;
4076
4077 // adjust
4078 if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
4079 }
4080
4081
4082
4083 // adjust
4084 bool seq = n-(k+1) < PAR_THRESH_SQ;
4085 NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
4086 NTL_IMPORT(p)
4087 NTL_IMPORT(n)
4088 NTL_IMPORT(k)
4089 long * NTL_RESTRICT y = &M[k][0];
4090
4091 // adjust
4092 for (long ii = first; ii < last; ii++) {
4093 long i = ii + k+1;
4094
4095 long * NTL_RESTRICT x = &M[i][0];
4096 long t1 = x[k];
4097 t1 = NegateMod(t1, p);
4098 // adjust // x[k] = 0;
4099 if (t1 == 0) continue;
4100
4101 // add t1 * row k to row i
4102 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
4103
4104 // adjust
4105 for (long j = k+1; j < n; j++) {
4106 long t2 = MulModPrecon(y[j], t1, p, t1pinv);
4107 x[j] = AddMod(x[j], t2, p);
4108 }
4109
4110 // adjust
4111 if (bp)
4112 {
4113 long t2 = MulModPrecon(bv[k], t1, p, t1pinv);
4114 bv[i] = AddMod(bv[i], t2, p);
4115 }
4116 }
4117 NTL_GEXEC_RANGE_END
4118 }
4119 else {
4120 clear(d);
4121 return;
4122 }
4123 }
4124
4125
4126 // adjust
4127 if (bp) {
4128 xp->SetLength(n);
4129 zz_p *X = xp->elts();
4130
4131 for (long i = n-1; i >= 0; i--) {
4132 long t1 = 0;
4133 for (long j = i+1; j < n; j++) {
4134 long t2 = MulMod(rep(X[j]), M[i][j], p);
4135 t1 = AddMod(t1, t2, p);
4136 }
4137 X[i].LoopHole() = SubMod(bv[i], t1, p);
4138 }
4139 }
4140
4141 d.LoopHole() = det;
4142 }
4143
4144
4145
4146
4147 #ifdef NTL_HAVE_LL_TYPE
4148
4149
4150
4151 static
4152 void alt_tri_L(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
4153 vec_zz_p *xp, bool trans, bool relax)
4154 {
4155 long n = A.NumRows();
4156
4157 if (A.NumCols() != n)
4158 LogicError("tri: nonsquare matrix");
4159
4160 // adjust
4161 if (bp && bp->length() != n)
4162 LogicError("tri: dimension mismatch");
4163
4164 // adjust
4165 if (bp && !xp)
4166 LogicError("tri: bad args");
4167
4168 if (n == 0) {
4169 set(d);
4170 if (xp) xp->SetLength(0);
4171 return;
4172 }
4173
4174
4175 // scratch space
4176 Mat<unsigned long> M;
4177 if (!trans) {
4178 conv(M, A);
4179 }
4180 else {
4181 M.SetDims(n, n);
4182 for (long i = 0; i < n; i++)
4183 for (long j = 0; j < n; j++)
4184 M[i][j] = rep(A[j][i]);
4185 }
4186
4187 Vec<long> bv;
4188 if (bp) conv(bv, *bp);
4189
4190 Vec<long> P;
4191 P.SetLength(n);
4192 for (long k = 0; k < n; k++) P[k] = k;
4193 // records swap operations
4194
4195 long det;
4196 det = 1;
4197
4198 long p = zz_p::modulus();
4199 mulmod_t pinv = zz_p::ModulusInverse();
4200 sp_reduce_struct red_struct = zz_p::red_struct();
4201
4202
4203
4204 bool pivoting = false;
4205
4206 unsigned long ured_trigger =
4207 (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
4208 // NOTE: corner case at p == 2: need unsigned long to prevent overflow
4209
4210 long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
4211
4212 long red_count = red_trigger;
4213
4214
4215 for (long k = 0; k < n; k++) {
4216 bool cleanup = false;
4217
4218 if (red_count-1 < 0) {
4219 red_count = red_trigger;
4220 cleanup = true;
4221 }
4222
4223 red_count = red_count-1;
4224
4225 long pos = -1;
4226 long pivot;
4227 long pivot_inv;
4228
4229 for (long i = k; i < n; i++) {
4230 // NOTE: by using InvModStatus, this code will work
4231 // for prime-powers as well as primes
4232 pivot = rem(M[i][k], p, red_struct);
4233 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
4234 pos = i;
4235 break;
4236 }
4237 }
4238
4239 if (pos != -1) {
4240 if (k != pos) {
4241 swap(M[pos], M[k]);
4242 det = NegateMod(det, p);
4243 P[k] = pos;
4244 pivoting = true;
4245
4246 if (bp) _ntl_swap(bv[pos], bv[k]);
4247 }
4248
4249 det = MulMod(det, pivot, p);
4250
4251 {
4252 // multiply row k by pivot_inv
4253 long t1 = pivot_inv;
4254 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
4255 unsigned long * NTL_RESTRICT y = &M[k][0];
4256 for (long j = k+1; j < n; j++) {
4257 long t2 = rem(y[j], p, red_struct);
4258 y[j] = MulModPrecon(t2, t1, p, t1pinv);
4259 }
4260
4261 if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
4262 }
4263
4264
4265
4266 bool seq = n-(k+1) < PAR_THRESH_SQ;
4267 NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
4268 NTL_IMPORT(p)
4269 NTL_IMPORT(n)
4270 NTL_IMPORT(k)
4271 NTL_IMPORT(red_struct)
4272 unsigned long * NTL_RESTRICT y = &M[k][0];
4273 if (cleanup) {
4274 for (long ii = first; ii < last; ii++) {
4275 long i = ii + k+1;
4276
4277 unsigned long * NTL_RESTRICT x = &M[i][0];
4278 for (long j = k+1; j < n; j++) {
4279 x[j] = rem(x[j], p, red_struct);
4280 }
4281 }
4282 }
4283
4284
4285 for (long ii = first; ii < last; ii++) {
4286 long i = ii + k+1;
4287
4288 unsigned long * NTL_RESTRICT x = &M[i][0];
4289 long t1 = rem(x[k], p, red_struct);
4290 t1 = NegateMod(t1, p);
4291 if (t1 == 0) continue;
4292
4293 // add t1 * row k to row i
4294 unsigned long ut1 = t1;
4295 long j;
4296 for (j = k+1; j <= n-4; j+=4) {
4297 unsigned long xj0 = x[j+0] + DO_MUL(y[j+0], ut1);
4298 unsigned long xj1 = x[j+1] + DO_MUL(y[j+1], ut1);
4299 unsigned long xj2 = x[j+2] + DO_MUL(y[j+2], ut1);
4300 unsigned long xj3 = x[j+3] + DO_MUL(y[j+3], ut1);
4301 x[j+0] = xj0;
4302 x[j+1] = xj1;
4303 x[j+2] = xj2;
4304 x[j+3] = xj3;
4305 }
4306 for (; j < n; j++) {
4307 x[j] += DO_MUL(y[j], ut1);
4308 }
4309
4310 if (bp)
4311 {
4312 long t2 = MulMod(bv[k], t1, p);
4313 bv[i] = AddMod(bv[i], t2, p);
4314 }
4315 }
4316 NTL_GEXEC_RANGE_END
4317 }
4318 else {
4319 clear(d);
4320 return;
4321 }
4322 }
4323
4324
4325
4326 if (bp) {
4327 xp->SetLength(n);
4328 zz_p *X = xp->elts();
4329
4330 for (long i = n-1; i >= 0; i--) {
4331 long t1 = 0;
4332 for (long j = i+1; j < n; j++) {
4333 long t0 = rem(M[i][j], p, red_struct);
4334 long t2 = MulMod(rep(X[j]), t0, p);
4335 t1 = AddMod(t1, t2, p);
4336 }
4337 X[i].LoopHole() = SubMod(bv[i], t1, p);
4338 }
4339 }
4340
4341 d.LoopHole() = det;
4342 }
4343
4344
4345
4346
4347 #ifdef NTL_HAVE_AVX
4348
4349 static
4350 void alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
4351 vec_zz_p *xp, bool trans, bool relax)
4352 {
4353 long n = A.NumRows();
4354
4355 if (A.NumCols() != n)
4356 LogicError("tri: nonsquare matrix");
4357
4358 // adjust
4359 if (bp && bp->length() != n)
4360 LogicError("tri: dimension mismatch");
4361
4362 // adjust
4363 if (bp && !xp)
4364 LogicError("tri: bad args");
4365
4366 if (n == 0) {
4367 set(d);
4368 if (xp) xp->SetLength(0);
4369 return;
4370 }
4371
4372
4373 // scratch space
4374
4375 Vec< AlignedArray<double> > M;
4376 M.SetLength(n);
4377 for (long i = 0; i < n; i++) M[i].SetLength(n);
4378 if (!trans) {
4379 for (long i = 0; i < n; i++)
4380 for (long j = 0; j < n; j++)
4381 M[i][j] = rep(A[i][j]);
4382 }
4383 else {
4384 for (long i = 0; i < n; i++)
4385 for (long j = 0; j < n; j++)
4386 M[i][j] = rep(A[j][i]);
4387 }
4388
4389 Vec<long> bv;
4390 if (bp) conv(bv, *bp);
4391
4392 Vec<long> P;
4393 P.SetLength(n);
4394 for (long k = 0; k < n; k++) P[k] = k;
4395 // records swap operations
4396
4397 long det;
4398 det = 1;
4399
4400 long p = zz_p::modulus();
4401 mulmod_t pinv = zz_p::ModulusInverse();
4402 sp_reduce_struct red_struct = zz_p::red_struct();
4403
4404
4405
4406 bool pivoting = false;
4407
4408 long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
4409 long red_count = red_trigger;
4410
4411 for (long k = 0; k < n; k++) {
4412 bool cleanup = false;
4413
4414 if (red_count-1 < 0) {
4415 red_count = red_trigger;
4416 cleanup = true;
4417 }
4418
4419 red_count = red_count-1;
4420
4421 long pos = -1;
4422 long pivot;
4423 long pivot_inv;
4424
4425 for (long i = k; i < n; i++) {
4426 // NOTE: by using InvModStatus, this code will work
4427 // for prime-powers as well as primes
4428 pivot = rem((unsigned long)(long)M[i][k], p, red_struct);
4429 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
4430 pos = i;
4431 break;
4432 }
4433 }
4434
4435 if (pos != -1) {
4436 if (k != pos) {
4437 swap(M[pos], M[k]);
4438 det = NegateMod(det, p);
4439 P[k] = pos;
4440 pivoting = true;
4441
4442 if (bp) _ntl_swap(bv[pos], bv[k]);
4443 }
4444
4445 det = MulMod(det, pivot, p);
4446
4447 {
4448 // multiply row k by pivot_inv
4449 long t1 = pivot_inv;
4450 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
4451 double * NTL_RESTRICT y = &M[k][0];
4452 for (long j = k+1; j < n; j++) {
4453 long t2 = rem((unsigned long)(long)y[j], p, red_struct);
4454 y[j] = MulModPrecon(t2, t1, p, t1pinv);
4455 }
4456
4457 if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
4458 }
4459
4460
4461
4462 bool seq = n-(k+1) < PAR_THRESH_SQ;
4463 NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
4464 NTL_IMPORT(p)
4465 NTL_IMPORT(n)
4466 NTL_IMPORT(k)
4467 NTL_IMPORT(red_struct)
4468 double * NTL_RESTRICT y = &M[k][0];
4469 if (cleanup) {
4470 for (long ii = first; ii < last; ii++) {
4471 long i = ii + k+1;
4472
4473 double * NTL_RESTRICT x = &M[i][0];
4474 for (long j = k+1; j < n; j++) {
4475 x[j] = rem((unsigned long)(long)x[j], p, red_struct);
4476 }
4477 }
4478 }
4479
4480 long align_boundary =
4481 min((((k+1)+(NTL_AVX_DBL_ALIGN-1))/NTL_AVX_DBL_ALIGN)*NTL_AVX_DBL_ALIGN, n);
4482
4483
4484 for (long ii = first; ii < last; ii++) {
4485 long i = ii + k+1;
4486
4487 double * NTL_RESTRICT x = &M[i][0];
4488 long t1 = rem((unsigned long)(long)x[k], p, red_struct);
4489 t1 = NegateMod(t1, p);
4490 if (t1 == 0) continue;
4491
4492 // add t1 * row k to row i
4493 double ut1 = t1;
4494 for (long j = k+1; j < align_boundary; j++) x[j] += y[j]*ut1;
4495 muladd_interval1(x+align_boundary, y+align_boundary, ut1, n-align_boundary);
4496
4497 if (bp)
4498 {
4499 long t2 = MulMod(bv[k], t1, p);
4500 bv[i] = AddMod(bv[i], t2, p);
4501 }
4502 }
4503 NTL_GEXEC_RANGE_END
4504 }
4505 else {
4506 clear(d);
4507 return;
4508 }
4509 }
4510
4511
4512
4513 if (bp) {
4514 xp->SetLength(n);
4515 zz_p *X = xp->elts();
4516
4517 for (long i = n-1; i >= 0; i--) {
4518 long t1 = 0;
4519 for (long j = i+1; j < n; j++) {
4520 long t0 = rem((unsigned long)(long)M[i][j], p, red_struct);
4521 long t2 = MulMod(rep(X[j]), t0, p);
4522 t1 = AddMod(t1, t2, p);
4523 }
4524 X[i].LoopHole() = SubMod(bv[i], t1, p);
4525 }
4526 }
4527
4528 d.LoopHole() = det;
4529 }
4530
4531
4532 #endif
4533
4534
4535
4536
4537 #ifdef NTL_HAVE_AVX
4538
4539 static
4540 void blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
4541 vec_zz_p *xp, bool trans, bool relax)
4542 {
4543 long n = A.NumRows();
4544
4545 if (A.NumCols() != n)
4546 LogicError("tri: nonsquare matrix");
4547
4548 if (bp && bp->length() != n)
4549 LogicError("tri: dimension mismatch");
4550
4551 if (bp && !xp)
4552 LogicError("tri: bad args");
4553
4554 if (n == 0) {
4555 set(d);
4556 if (xp) xp->SetLength(0);
4557 return;
4558 }
4559
4560 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
4561
4562 long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
4563
4564 Vec< AlignedArray<double> > M;
4565 M.SetLength(npanels);
4566 for (long panel = 0; panel < npanels; panel++) {
4567 M[panel].SetLength(n*MAT_BLK_SZ);
4568 double *panelp = &M[panel][0];
4569
4570 for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
4571 }
4572
4573 if (trans) {
4574 // copy A transposed into panels
4575 for (long i = 0; i < n; i++) {
4576 const zz_p *row = &A[i][0];
4577 double *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
4578 for (long j = 0; j < n; j++)
4579 col[j*MAT_BLK_SZ] = rep(row[j]);
4580 }
4581 }
4582 else {
4583 // copy A into panels
4584 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
4585 long j_max = min(jj+MAT_BLK_SZ, n);
4586 double *panelp = &M[panel][0];
4587
4588 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
4589 const zz_p *ap = A[i].elts() + jj;
4590
4591 for (long j = jj; j < j_max; j++)
4592 panelp[j-jj] = rep(ap[j-jj]);
4593 }
4594 }
4595 }
4596
4597 Vec<long> bv;
4598 if (bp) conv(bv, *bp);
4599
4600 Vec<long> P;
4601 P.SetLength(n);
4602 for (long k = 0; k < n; k++) P[k] = k;
4603 // records swap operations
4604
4605
4606 long det;
4607 det = 1;
4608
4609 long p = zz_p::modulus();
4610 mulmod_t pinv = zz_p::ModulusInverse();
4611 sp_reduce_struct red_struct = zz_p::red_struct();
4612
4613
4614 bool pivoting = false;
4615
4616 long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
4617 long red_count = red_trigger;
4618
4619 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
4620 long k_max = min(kk+MAT_BLK_SZ, n);
4621
4622 bool cleanup = false;
4623
4624 if (red_count-MAT_BLK_SZ < 0) {
4625 red_count = red_trigger;
4626 cleanup = true;
4627 }
4628
4629 red_count = red_count-MAT_BLK_SZ;
4630 double * NTL_RESTRICT kpanelp = &M[kpanel][0];
4631
4632 if (cleanup) {
4633 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
4634 kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
4635 }
4636
4637 for (long k = kk; k < k_max; k++) {
4638
4639 long pos = -1;
4640 long pivot;
4641 long pivot_inv;
4642
4643 for (long i = k; i < n; i++) {
4644 // NOTE: by using InvModStatus, this code will work
4645 // for prime-powers as well as primes
4646 pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
4647 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
4648 pos = i;
4649 break;
4650 }
4651 }
4652
4653 if (pos == -1) {
4654 clear(d);
4655 return;
4656 }
4657
4658 double * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
4659 if (k != pos) {
4660 // swap rows pos and k
4661 double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
4662 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
4663
4664 det = NegateMod(det, p);
4665 P[k] = pos;
4666 pivoting = true;
4667
4668 if (bp) _ntl_swap(bv[pos], bv[k]);
4669 }
4670
4671 det = MulMod(det, pivot, p);
4672
4673 {
4674 // multiply row k by pivot_inv
4675 long t1 = pivot_inv;
4676 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
4677 for (long j = 0; j < MAT_BLK_SZ; j++) {
4678 long t2 = rem((unsigned long)(long)y[j], p, red_struct);
4679 y[j] = MulModPrecon(t2, t1, p, t1pinv);
4680 }
4681
4682 y[k-kk] = pivot_inv;
4683
4684 if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
4685 }
4686
4687 for (long i = kk; i < n; i++) {
4688 if (i == k) continue; // skip row k
4689
4690 double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
4691 long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
4692 t1 = NegateMod(t1, p);
4693 x[k-kk] = 0;
4694 if (t1 == 0) continue;
4695
4696 // add t1 * row k to row i
4697 double ut1 = t1;
4698 muladd_interval(x, y, ut1, MAT_BLK_SZ);
4699 if (bp)
4700 {
4701 long t2 = MulMod(bv[k], t1, p);
4702 bv[i] = AddMod(bv[i], t2, p);
4703 }
4704 }
4705 }
4706
4707
4708 // finished processing current kpanel
4709 // next, reduce and apply to all other kpanels
4710
4711 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
4712 kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
4713
4714 // special processing: subtract 1 off of diangonal
4715
4716 for (long k = kk; k < k_max; k++)
4717 kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
4718
4719
4720 bool seq = double(npanels-(kpanel+1))*double(n)*double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;
4721
4722 NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
4723 NTL_IMPORT(p)
4724 NTL_IMPORT(n)
4725 NTL_IMPORT(red_struct)
4726 NTL_IMPORT(kpanel)
4727 NTL_IMPORT(kpanelp)
4728 NTL_IMPORT(kk)
4729 NTL_IMPORT(k_max)
4730
4731
4732 AlignedArray<double> buf_store;
4733 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
4734 double *buf = &buf_store[0];
4735
4736 for (long index = first; index < last; index++) {
4737 long jpanel = index + kpanel+1;
4738
4739 double * NTL_RESTRICT jpanelp = &M[jpanel][0];
4740
4741 if (cleanup) {
4742 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
4743 jpanelp[r] = rem((unsigned long)(long)jpanelp[r], p, red_struct);
4744 }
4745
4746 // perform swaps
4747 for (long k = kk; k < k_max; k++) {
4748 long pos = P[k];
4749 if (pos != k) {
4750 // swap rows pos and k
4751 double * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
4752 double * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
4753 for (long j = 0; j < MAT_BLK_SZ; j++)
4754 _ntl_swap(pos_p[j], k_p[j]);
4755 }
4756 }
4757
4758 // copy block number kpanel (the one on the diagonal) into buf
4759
4760 for (long i = 0; i < (k_max-kk)*MAT_BLK_SZ; i++)
4761 buf[i] = rem((unsigned long)(long)jpanelp[kk*MAT_BLK_SZ+i], p, red_struct);
4762
4763 // jpanel += kpanel*buf
4764
4765 muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk);
4766 }
4767
4768 NTL_GEXEC_RANGE_END
4769
4770 // special processing: add 1 back to the diangonal
4771
4772 for (long k = kk; k < k_max; k++)
4773 kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
4774
4775 }
4776
4777 if (bp) {
4778 xp->SetLength(n);
4779 zz_p *X = xp->elts();
4780
4781 for (long i = n-1; i >= 0; i--) {
4782 long t1 = 0;
4783 long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
4784 for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
4785 jj < n; jj += MAT_BLK_SZ, panel++) {
4786 long j_max = min(jj+MAT_BLK_SZ, n);
4787 double *row = &M[panel][i*MAT_BLK_SZ];
4788 for (long j = jj; j < j_max; j++) {
4789 long t0 = rem((unsigned long)(long)row[j-jj], p, red_struct);
4790 long t2 = MulMod(rep(X[j]), t0, p);
4791 t1 = AddMod(t1, t2, p);
4792 }
4793 }
4794 X[i].LoopHole() = SubMod(bv[i], t1, p);
4795 }
4796 }
4797
4798 d.LoopHole() = det;
4799
4800 }
4801
4802 #endif
4803
4804
4805 static
4806 void blk_tri_L(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
4807 vec_zz_p *xp, bool trans, bool relax)
4808 {
4809 long n = A.NumRows();
4810
4811 if (A.NumCols() != n)
4812 LogicError("tri: nonsquare matrix");
4813
4814 if (bp && bp->length() != n)
4815 LogicError("tri: dimension mismatch");
4816
4817 if (bp && !xp)
4818 LogicError("tri: bad args");
4819
4820 if (n == 0) {
4821 set(d);
4822 if (xp) xp->SetLength(0);
4823 return;
4824 }
4825
4826 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
4827
4828 long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
4829
4830 Vec< UniqueArray<unsigned long> > M;
4831 M.SetLength(npanels);
4832 for (long panel = 0; panel < npanels; panel++) {
4833 M[panel].SetLength(n*MAT_BLK_SZ);
4834 unsigned long *panelp = &M[panel][0];
4835
4836 for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
4837 }
4838
4839 if (trans) {
4840 // copy A transposed into panels
4841 for (long i = 0; i < n; i++) {
4842 const zz_p *row = &A[i][0];
4843 unsigned long *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
4844 for (long j = 0; j < n; j++)
4845 col[j*MAT_BLK_SZ] = rep(row[j]);
4846 }
4847 }
4848 else {
4849 // copy A into panels
4850 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
4851 long j_max = min(jj+MAT_BLK_SZ, n);
4852 unsigned long *panelp = &M[panel][0];
4853
4854 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
4855 const zz_p *ap = A[i].elts() + jj;
4856
4857 for (long j = jj; j < j_max; j++)
4858 panelp[j-jj] = rep(ap[j-jj]);
4859 }
4860 }
4861 }
4862
4863 Vec<long> bv;
4864 if (bp) conv(bv, *bp);
4865
4866 Vec<long> P;
4867 P.SetLength(n);
4868 for (long k = 0; k < n; k++) P[k] = k;
4869 // records swap operations
4870
4871
4872 long det;
4873 det = 1;
4874
4875 long p = zz_p::modulus();
4876 mulmod_t pinv = zz_p::ModulusInverse();
4877 sp_reduce_struct red_struct = zz_p::red_struct();
4878
4879
4880 bool pivoting = false;
4881
4882 unsigned long ured_trigger =
4883 (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
4884 // NOTE: corner case at p == 2: need unsigned long to prevent overflow
4885
4886 long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
4887
4888 long red_count = red_trigger;
4889
4890 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
4891 long k_max = min(kk+MAT_BLK_SZ, n);
4892
4893 bool cleanup = false;
4894
4895 if (red_count-MAT_BLK_SZ < 0) {
4896 red_count = red_trigger;
4897 cleanup = true;
4898 }
4899
4900 red_count = red_count-MAT_BLK_SZ;
4901 unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
4902
4903 if (cleanup) {
4904 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
4905 kpanelp[r] = rem(kpanelp[r], p, red_struct);
4906 }
4907
4908 for (long k = kk; k < k_max; k++) {
4909
4910 long pos = -1;
4911 long pivot;
4912 long pivot_inv;
4913
4914 for (long i = k; i < n; i++) {
4915 // NOTE: by using InvModStatus, this code will work
4916 // for prime-powers as well as primes
4917 pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
4918 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
4919 pos = i;
4920 break;
4921 }
4922 }
4923
4924 if (pos == -1) {
4925 clear(d);
4926 return;
4927 }
4928
4929 unsigned long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
4930 if (k != pos) {
4931 // swap rows pos and k
4932 unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
4933 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
4934
4935 det = NegateMod(det, p);
4936 P[k] = pos;
4937 pivoting = true;
4938
4939 if (bp) _ntl_swap(bv[pos], bv[k]);
4940 }
4941
4942 det = MulMod(det, pivot, p);
4943
4944 {
4945 // multiply row k by pivot_inv
4946 long t1 = pivot_inv;
4947 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
4948 for (long j = 0; j < MAT_BLK_SZ; j++) {
4949 long t2 = rem(y[j], p, red_struct);
4950 y[j] = MulModPrecon(t2, t1, p, t1pinv);
4951 }
4952
4953 y[k-kk] = pivot_inv;
4954
4955 if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
4956 }
4957
4958 for (long i = kk; i < n; i++) {
4959 if (i == k) continue; // skip row k
4960
4961 unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
4962 long t1 = rem(x[k-kk], p, red_struct);
4963 t1 = NegateMod(t1, p);
4964 x[k-kk] = 0;
4965 if (t1 == 0) continue;
4966
4967 // add t1 * row k to row i
4968 unsigned long ut1 = t1;
4969 muladd_interval(x, y, ut1, MAT_BLK_SZ);
4970 if (bp)
4971 {
4972 long t2 = MulMod(bv[k], t1, p);
4973 bv[i] = AddMod(bv[i], t2, p);
4974 }
4975 }
4976 }
4977
4978
4979 // finished processing current kpanel
4980 // next, reduce and apply to all other kpanels
4981
4982 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
4983 kpanelp[r] = rem(kpanelp[r], p, red_struct);
4984
4985 // special processing: subtract 1 off of diangonal
4986
4987 for (long k = kk; k < k_max; k++)
4988 kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
4989
4990
4991 bool seq = double(npanels-(kpanel+1))*double(n)*double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;
4992 NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
4993 NTL_IMPORT(p)
4994 NTL_IMPORT(n)
4995 NTL_IMPORT(red_struct)
4996 NTL_IMPORT(kpanel)
4997 NTL_IMPORT(kpanelp)
4998 NTL_IMPORT(kk)
4999 NTL_IMPORT(k_max)
5000
5001
5002 UniqueArray<unsigned long> buf_store;
5003 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
5004 unsigned long *buf = &buf_store[0];
5005
5006 for (long index = first; index < last; index++) {
5007 long jpanel = index + kpanel+1;
5008
5009 unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
5010
5011 if (cleanup) {
5012 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
5013 jpanelp[r] = rem(jpanelp[r], p, red_struct);
5014 }
5015
5016 // perform swaps
5017 for (long k = kk; k < k_max; k++) {
5018 long pos = P[k];
5019 if (pos != k) {
5020 // swap rows pos and k
5021 unsigned long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
5022 unsigned long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
5023 for (long j = 0; j < MAT_BLK_SZ; j++)
5024 _ntl_swap(pos_p[j], k_p[j]);
5025 }
5026 }
5027
5028 // copy block number kpanel (the one on the diagonal) into buf
5029 // here, we transpose it
5030
5031 for (long k = kk; k < k_max; k++)
5032 for (long j = 0; j < MAT_BLK_SZ; j++)
5033 buf[j*MAT_BLK_SZ + (k-kk)] =
5034 rem(jpanelp[k*MAT_BLK_SZ+j], p, red_struct);
5035
5036 // jpanel += kpanel*buf
5037
5038 muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk);
5039 }
5040
5041 NTL_GEXEC_RANGE_END
5042
5043 // special processing: add 1 back to the diangonal
5044
5045 for (long k = kk; k < k_max; k++)
5046 kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
5047
5048 }
5049
5050 if (bp) {
5051 xp->SetLength(n);
5052 zz_p *X = xp->elts();
5053
5054 for (long i = n-1; i >= 0; i--) {
5055 long t1 = 0;
5056 long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
5057 for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
5058 jj < n; jj += MAT_BLK_SZ, panel++) {
5059 long j_max = min(jj+MAT_BLK_SZ, n);
5060 unsigned long *row = &M[panel][i*MAT_BLK_SZ];
5061 for (long j = jj; j < j_max; j++) {
5062 long t0 = rem(row[j-jj], p, red_struct);
5063 long t2 = MulMod(rep(X[j]), t0, p);
5064 t1 = AddMod(t1, t2, p);
5065 }
5066 }
5067 X[i].LoopHole() = SubMod(bv[i], t1, p);
5068 }
5069 }
5070
5071 d.LoopHole() = det;
5072
5073 }
5074
5075
// blk_tri_LL: blocked (panel-by-panel) elimination on a square matrix over
// zz_p, with all working data held as plain `long` residues.
//
//   d      receives the determinant; it is cleared (set to 0) and the routine
//          returns early as soon as some column has no usable pivot.
//   A      the n x n input matrix.
//   bp/xp  optional right-hand side / solution vector.  If bp is non-null,
//          xp must be non-null too; *xp is resized to n and filled in.
//   trans  if true, the transpose of A is loaded into the working panels.
//   relax  forwarded unchanged to relaxed_InvModStatus.
//
// LogicError is called for a non-square A, for bp->length() != n, and for
// bp without xp.  ResourceError is called if NTL_OVERFLOW(n, MAT_BLK_SZ, 0)
// reports an overflow.
//
// Storage layout: M[panel] holds MAT_BLK_SZ consecutive columns of the
// working matrix for all n rows, row-major, so the entry in row i and column
// jj+c of the panel starting at column jj lives at M[panel][i*MAT_BLK_SZ + c].
5076 static
5077 void blk_tri_LL(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
5078 vec_zz_p *xp, bool trans, bool relax)
5079 {
5080 long n = A.NumRows();
5081
5082 if (A.NumCols() != n)
5083 LogicError("tri: nonsquare matrix");
5084
5085 if (bp && bp->length() != n)
5086 LogicError("tri: dimension mismatch");
5087
5088 if (bp && !xp)
5089 LogicError("tri: bad args");
5090
// empty matrix: determinant is 1 and the solution (if requested) is empty
5091 if (n == 0) {
5092 set(d);
5093 if (xp) xp->SetLength(0);
5094 return;
5095 }
5096
5097 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
5098
// number of column panels, rounding up; the last panel may be only
// partially used, which is why every panel is zero-filled below
5099 long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
5100
5101 Vec< UniqueArray<long> > M;
5102 M.SetLength(npanels);
5103 for (long panel = 0; panel < npanels; panel++) {
5104 M[panel].SetLength(n*MAT_BLK_SZ);
5105 long *panelp = &M[panel][0];
5106
5107 for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
5108 }
5109
5110 if (trans) {
5111 // copy A transposed into panels
// row i of A becomes column i of the working matrix: `col` points at
// row 0 of that column, and consecutive rows are MAT_BLK_SZ apart
5112 for (long i = 0; i < n; i++) {
5113 const zz_p *row = &A[i][0];
5114 long *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
5115 for (long j = 0; j < n; j++)
5116 col[j*MAT_BLK_SZ] = rep(row[j]);
5117 }
5118 }
5119 else {
5120 // copy A into panels
5121 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
5122 long j_max = min(jj+MAT_BLK_SZ, n);
5123 long *panelp = &M[panel][0];
5124
5125 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
5126 const zz_p *ap = A[i].elts() + jj;
5127
5128 for (long j = jj; j < j_max; j++)
5129 panelp[j-jj] = rep(ap[j-jj]);
5130 }
5131 }
5132 }
5133
// working copy of the right-hand side (left empty when bp is null)
5134 Vec<long> bv;
5135 if (bp) conv(bv, *bp);
5136
5137 Vec<long> P;
5138 P.SetLength(n);
5139 for (long k = 0; k < n; k++) P[k] = k;
5140 // records swap operations
5141
5142
// running determinant: multiplied by each pivot, negated on each row swap
5143 long det;
5144 det = 1;
5145
5146 long p = zz_p::modulus();
5147 mulmod_t pinv = zz_p::ModulusInverse();
5148 sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
5149
5150
// NOTE(review): `pivoting` is set below but never read in this function
5151 bool pivoting = false;
5152
// outer loop: one iteration per column panel; kk is the first column of
// the panel and k_max is one past its last column
5153 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
5154 long k_max = min(kk+MAT_BLK_SZ, n);
5155
5156 long * NTL_RESTRICT kpanelp = &M[kpanel][0];
5157
5158 for (long k = kk; k < k_max; k++) {
5159
5160 long pos = -1;
5161 long pivot;
5162 long pivot_inv;
5163
// scan column k from row k downwards for the first non-zero entry for
// which relaxed_InvModStatus returns 0 (treated here as "inverse
// found"; pivot_inv is used afterwards)
5164 for (long i = k; i < n; i++) {
5165 // NOTE: by using InvModStatus, this code will work
5166 // for prime-powers as well as primes
5167 pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
5168 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
5169 pos = i;
5170 break;
5171 }
5172 }
5173
// no usable pivot in this column: report determinant 0 and stop
5174 if (pos == -1) {
5175 clear(d);
5176 return;
5177 }
5178
5179 long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
5180 if (k != pos) {
5181 // swap rows pos and k
// only the current panel is swapped here; the swap is recorded in P and
// replayed on the remaining panels further down
5182 long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
5183 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
5184
5185 det = NegateMod(det, p);
5186 P[k] = pos;
5187 pivoting = true;
5188
5189 if (bp) _ntl_swap(bv[pos], bv[k]);
5190 }
5191
5192 det = MulMod(det, pivot, p);
5193
5194 {
5195 // multiply row k by pivot_inv
5196 long t1 = pivot_inv;
5197 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
5198 for (long j = 0; j < MAT_BLK_SZ; j++) {
5199 y[j] = MulModPrecon(y[j], t1, p, t1pinv);
5200 }
5201
// the pivot position itself is overwritten with pivot_inv rather than
// keeping the product computed by the loop above
5202 y[k-kk] = pivot_inv;
5203
5204 if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
5205 }
5206
// eliminate column k from every row in [kk, n) other than row k; note
// that this includes rows of the current panel that lie above row k
5207 for (long i = kk; i < n; i++) {
5208 if (i == k) continue; // skip row k
5209
5210 long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
5211 long t1 = x[k-kk];
5212 t1 = NegateMod(t1, p);
5213 x[k-kk] = 0;
5214 if (t1 == 0) continue;
5215
5216 // add t1 * row k to row i
5217 long ut1 = t1;
5218 muladd_interval(x, y, ut1, MAT_BLK_SZ, p, pinv);
5219 if (bp)
5220 {
5221 long t2 = MulMod(bv[k], t1, p);
5222 bv[i] = AddMod(bv[i], t2, p);
5223 }
5224 }
5225 }
5226
5227
5228 // finished processing current kpanel
5229 // next, reduce and apply to all other kpanels
5230
5231 // special processing: subtract 1 off of the diagonal
5232
5233 for (long k = kk; k < k_max; k++)
5234 kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
5235
5236
// run sequentially when the estimated work for the remaining panels is
// below PAR_THRESH
5237 bool seq = double(npanels-(kpanel+1))*double(n)*double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;
5238 NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
5239 NTL_IMPORT(p)
5240 NTL_IMPORT(n)
5241 NTL_IMPORT(ll_red_struct)
5242 NTL_IMPORT(kpanel)
5243 NTL_IMPORT(kpanelp)
5244 NTL_IMPORT(kk)
5245 NTL_IMPORT(k_max)
5246
5247
// scratch block, allocated once per executed range [first, last)
5248 UniqueArray<long> buf_store;
5249 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
5250 long *buf = &buf_store[0];
5251
// index is relative; the panels updated are those to the right of kpanel
5252 for (long index = first; index < last; index++) {
5253 long jpanel = index + kpanel+1;
5254
5255 long * NTL_RESTRICT jpanelp = &M[jpanel][0];
5256
5257 // perform swaps
5258 for (long k = kk; k < k_max; k++) {
5259 long pos = P[k];
5260 if (pos != k) {
5261 // swap rows pos and k
5262 long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
5263 long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
5264 for (long j = 0; j < MAT_BLK_SZ; j++)
5265 _ntl_swap(pos_p[j], k_p[j]);
5266 }
5267 }
5268
5269 // copy block number kpanel (the one on the diagonal) into buf
5270 // here, we transpose it
// (the values read are rows [kk, k_max) of jpanel, stored transposed)
5271
5272 for (long k = kk; k < k_max; k++)
5273 for (long j = 0; j < MAT_BLK_SZ; j++)
5274 buf[j*MAT_BLK_SZ + (k-kk)] = jpanelp[k*MAT_BLK_SZ+j];
5275
5276 // jpanel += kpanel*buf
5277
5278 muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk, p, ll_red_struct);
5279 }
5280
5281 NTL_GEXEC_RANGE_END
5282
5283 // special processing: add 1 back to the diagonal
5284
5285 for (long k = kk; k < k_max; k++)
5286 kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
5287
5288 }
5289
// back substitution, last row first: X[i] = bv[i] - sum_j row_i[j]*X[j],
// where j runs only over columns in panels strictly after the panel
// containing column i (start_panel is that first later panel)
5290 if (bp) {
5291 xp->SetLength(n);
5292 zz_p *X = xp->elts();
5293
5294 for (long i = n-1; i >= 0; i--) {
5295 long t1 = 0;
5296 long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
5297 for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
5298 jj < n; jj += MAT_BLK_SZ, panel++) {
5299 long j_max = min(jj+MAT_BLK_SZ, n);
5300 long *row = &M[panel][i*MAT_BLK_SZ];
5301 for (long j = jj; j < j_max; j++) {
5302 long t0 = row[j-jj];
5303 long t2 = MulMod(rep(X[j]), t0, p);
5304 t1 = AddMod(t1, t2, p);
5305 }
5306 }
5307 X[i].LoopHole() = SubMod(bv[i], t1, p);
5308 }
5309 }
5310
5311 d.LoopHole() = det;
5312
5313 }
5314
5315
5316
5317 #endif
5318
5319
5320
5321 static
5322 void tri(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
5323 vec_zz_p *xp, bool trans, bool relax)
5324 {
5325 long n = A.NumRows();
5326
5327 if (A.NumCols() != n)
5328 LogicError("inv: nonsquare matrix");
5329
5330 if (bp && bp->length() != n)
5331 LogicError("tri: dimension mismatch");
5332
5333 if (bp && !xp)
5334 LogicError("tri: bad args");
5335
5336 #ifndef NTL_HAVE_LL_TYPE
5337
5338 basic_tri(d, A, bp, xp, trans, relax);
5339
5340 #else
5341
5342 long p = zz_p::modulus();
5343
5344 if (n < 16) {
5345 //cerr << "basic_tri\n";
5346 basic_tri(d, A, bp, xp, trans, relax);
5347 }
5348 else if (n/MAT_BLK_SZ < 4) {
5349 long V = 64;
5350
5351 #ifdef NTL_HAVE_AVX
5352 if (p-1 <= MAX_DBL_INT &&
5353 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
5354 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
5355
5356 //cerr << "alt_tri_DD\n";
5357 alt_tri_DD(d, A, bp, xp, trans, relax);
5358 }
5359 else
5360 #endif
5361 if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
5362 cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
5363
5364 //cerr << "alt_tri_L\n";
5365 alt_tri_L(d, A, bp, xp, trans, relax);
5366
5367 }
5368 else {
5369
5370 //cerr << "basic_tri\n";
5371 basic_tri(d, A, bp, xp, trans, relax);
5372 }
5373 }
5374 else {
5375 long V = 4*MAT_BLK_SZ;
5376
5377 #ifdef NTL_HAVE_AVX
5378 if (p-1 <= MAX_DBL_INT &&
5379 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
5380 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
5381
5382 //cerr << "blk_tri_DD\n";
5383 blk_tri_DD(d, A, bp, xp, trans, relax);
5384 }
5385 else
5386 #endif
5387 if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
5388 cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
5389
5390 //cerr << "blk_tri_L\n";
5391 blk_tri_L(d, A, bp, xp, trans, relax);
5392
5393 }
5394 else {
5395
5396 //cerr << "blk_tri_LL\n";
5397 blk_tri_LL(d, A, bp, xp, trans, relax);
5398 }
5399
5400 }
5401
5402 #endif
5403
5404
5405
5406 }
5407
5408
5409
5410 void relaxed_determinant(zz_p& d, const mat_zz_p& A, bool relax)
5411 {
5412 tri(d, A, 0, 0, false, relax);
5413 }
5414
5415
5416 void relaxed_solve(zz_p& d, vec_zz_p& x,
5417 const mat_zz_p& A, const vec_zz_p& b, bool relax)
5418 {
5419 tri(d, A, &b, &x, true, relax);
5420 }
5421
5422 void relaxed_solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b, bool relax)
5423 {
5424 tri(d, A, &b, &x, false, relax);
5425 }
5426
5427 // ******************************************************************
5428 //
5429 // new image and kernel routines
5430 //
5431 // ******************************************************************
5432
5433
// elim_basic: row elimination on the first w columns of the n x m matrix A,
// working on a Mat<long> copy of the residues.
//
//   A     input matrix (n rows, m columns).
//   im    if non-null, receives the eliminated rows: r rows when `full` is
//         false, n rows when `full` is true (the rows after the first r are
//         written with their first w columns set to zero).
//   ker   if non-null, receives an (n-r) x n matrix of the form [X | I] with
//         the recorded row swaps undone.
//   w     number of leading columns to eliminate on; must satisfy
//         0 <= w <= m, otherwise LogicError is called.
//   full  selects the shape of *im as described above.
//
// Returns r, the number of pivots found in the first w columns.
5434 static
5435 long elim_basic(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
5436 long w, bool full)
5437 {
5438 long n = A.NumRows();
5439 long m = A.NumCols();
5440
5441 if (w < 0 || w > m) LogicError("elim: bad args");
5442
5443 // take care of corner cases
5444 if (n == 0) {
5445 if (im) im->SetDims(0, m);
5446 if (ker) ker->SetDims(0, 0);
5447 return 0;
5448 }
5449
// nothing to eliminate on: the image is A itself (or empty) and the
// kernel output is the n x n identity
5450 if (w == 0) {
5451 if (im) {
5452 if (full)
5453 (*im) = A;
5454 else
5455 im->SetDims(0, m);
5456 }
5457 if (ker) ident(*ker, n);
5458 return 0;
5459 }
5460
5461 Mat<long> M;
5462 conv(M, A);
5463
5464 Vec<long> P;
5465 P.SetLength(n);
5466 for (long k = 0; k < n; k++) P[k] = k;
5467 // records swap operations
5468
5469 Vec<long> pcol;
5470 pcol.SetLength(n);
5471 // pcol[i] records pivot columns for row i
5472
5473 long p = zz_p::modulus();
5474 mulmod_t pinv = zz_p::ModulusInverse();
5475
5476 bool pivoting = false;
5477
// r counts the pivots found so far; it is also the index of the next
// pivot row
5478 long r = 0;
5479
5480 for (long k = 0; k < w; k++) {
5481 long pos = -1;
5482 long pivot_inv;
// first non-zero entry of column k at or below row r
5483 for (long i = r; i < n; i++) {
5484 long pivot = M[i][k];
5485 if (pivot != 0) {
5486 pivot_inv = InvMod(pivot, p);
5487 pos = i;
5488 break;
5489 }
5490 }
5491
// column k has no pivot: move on without advancing r
5492 if (pos == -1)
5493 continue;
5494
5495 if (r != pos) {
5496 swap(M[pos], M[r]);
5497 P[r] = pos;
5498 pivoting = true;
5499 }
5500
5501 bool seq = double(n-r)*double(m-k) < PAR_THRESH;
5502
// the range is relative: index ii stands for row ii + r + 1
5503 NTL_GEXEC_RANGE(seq, n-(r+1), first, last)
5504 NTL_IMPORT(p)
5505 NTL_IMPORT(n)
5506 NTL_IMPORT(k)
5507 NTL_IMPORT(r)
5508 long * NTL_RESTRICT y = &M[r][0];
5509
5510 for (long ii = first; ii < last; ii++) {
5511 long i = ii + r+1;
5512
5513 long * NTL_RESTRICT x = &M[i][0];
5514 long t1 = x[k];
5515 t1 = MulMod(t1, pivot_inv, p);
5516 t1 = NegateMod(t1, p);
// the multiplier is kept in place of the eliminated entry; the kernel
// computation below reads it back from column pcol[..]
5517 x[k] = t1;
5518 if (t1 == 0) continue;
5519
5520 // add t1 * row r to row i
5521 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
5522
5523 for (long j = k+1; j < m; j++) {
5524 long t2 = MulModPrecon(y[j], t1, p, t1pinv);
5525 x[j] = AddMod(x[j], t2, p);
5526 }
5527 }
5528 NTL_GEXEC_RANGE_END
5529
5530 pcol[r] = k;
5531 r++;
5532 }
5533
5534 if (im) {
5535 mat_zz_p& Im = *im;;
5536 if (full)
5537 Im.SetDims(n, m);
5538 else
5539 Im.SetDims(r, m);
5540
// pivot rows: entries left of the pivot column are written as zero
5541 for (long i = 0; i < r; i++) {
5542 long pc = pcol[i];
5543 for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
5544 for (long j = pc; j < m; j++) Im[i][j].LoopHole() = M[i][j];
5545 }
5546
// remaining rows (only when `full`): first w columns are written as zero
5547 if (full) {
5548 for (long i = r; i < n; i++) {
5549 for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
5550 for (long j = w; j < m; j++) Im[i][j].LoopHole() = M[i][j];
5551 }
5552 }
5553 }
5554
5555 if (ker) {
5556
// every row is a pivot row: the kernel output has zero rows
5557 if (n == r) {
5558 mat_zz_p& Ker = *ker;
5559 Ker.SetDims(n-r, n);
5560 }
5561 else {
// colbuf[k][i], for i > k, is the multiplier stored in row i at pivot
// column pcol[k]
5562 Mat<long> colbuf;
5563 colbuf.SetDims(r, n);
5564
5565 for (long k = 0; k < r; k++) {
5566 long pc = pcol[k];
5567 for (long i = k+1; i < n; i++) colbuf[k][i] = M[i][pc];
5568 }
5569
// M is released here; everything still needed has been copied to colbuf
5570 M.kill();
5571
5572 Mat<long> X;
5573 X.SetDims(n-r, r);
5574
5575 bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;
5576 NTL_GEXEC_RANGE(seq, n-r, first, last)
5577 NTL_IMPORT(p)
5578 NTL_IMPORT(r)
5579
// each row of X is filled from its last column to its first:
// X[i][k] = colbuf[k][i+r] + sum_{j>k} X[i][j]*colbuf[k][j]  (mod p)
5580 for (long i = first; i < last; i++) {
5581 long *Xi = &X[i][0];
5582
5583 for (long k = r-1; k >= 0; k--) {
5584 long *cvecp = &colbuf[k][0];
5585
5586 long acc = cvecp[i+r];
5587 for (long j = k+1; j < r; j++) {
5588 acc = AddMod( acc, MulMod(Xi[j], cvecp[j], p), p );
5589 }
5590 Xi[k] = acc;
5591 }
5592
5593 }
5594
5595 NTL_GEXEC_RANGE_END
5596
// assemble Ker = [ X | I ]
5597 mat_zz_p& Ker = *ker;
5598 Ker.SetDims(n-r, n);
5599 for (long i = 0; i < n-r; i++) {
5600 for (long j = 0; j < r; j++) Ker[i][j].LoopHole() = X[i][j];
5601 for (long j = r; j < n; j++) Ker[i][j].LoopHole() = 0;
5602 Ker[i][r+i].LoopHole() = 1;
5603 }
5604
// undo the recorded row swaps, in reverse order, on the columns of Ker
5605 if (pivoting) {
5606 for (long i = 0; i < n-r; i++) {
5607 zz_p *x = Ker[i].elts();
5608
5609 for (long k = n-1; k >= 0; k--) {
5610 long pos = P[k];
5611 if (pos != k) swap(x[pos], x[k]);
5612 }
5613 }
5614 }
5615 }
5616 }
5617
5618 return r;
5619 }
5620
5621 #ifdef NTL_HAVE_LL_TYPE
5622
5623
5624 #ifdef NTL_HAVE_AVX
5625
5626
5627 static inline
5628 void CopyBlock(double *dst_ptr, long dst_blk, const double *src_ptr, long src_blk, long src_limit)
5629 {
5630 long src_row = src_blk*MAT_BLK_SZ;
5631 long dst_row = dst_blk*MAT_BLK_SZ;
5632
5633 long nrows = min(MAT_BLK_SZ, src_limit - src_row);
5634
5635 for (long i = 0; i < nrows; i++)
5636 for (long j = 0; j < MAT_BLK_SZ; j++)
5637 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
5638
5639 for (long i = nrows; i < MAT_BLK_SZ; i++)
5640 for (long j = 0; j < MAT_BLK_SZ; j++)
5641 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
5642
5643 }
5644
5645 static inline
5646 void CopyBlock(double *dst_ptr, long dst_blk, const double *src_ptr, long src_blk)
5647 {
5648 long src_row = src_blk*MAT_BLK_SZ;
5649 long dst_row = dst_blk*MAT_BLK_SZ;
5650
5651 long nrows = MAT_BLK_SZ;
5652
5653 for (long i = 0; i < nrows; i++)
5654 for (long j = 0; j < MAT_BLK_SZ; j++)
5655 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
5656 }
5657
5658 static inline
5659 void SwapOneRow(double *panelp, long i, long pos)
5660 {
5661 double * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
5662 double * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
5663 for (long j = 0; j < MAT_BLK_SZ; j++)
5664 _ntl_swap(pos_p[j], i_p[j]);
5665 }
5666
5667 static inline
5668 void ApplySwaps(double *panelp, long start, long end, const Vec<long>& P)
5669 {
5670 for (long i = start; i < end; i++) {
5671 long pos = P[i];
5672 if (pos != i)
5673 SwapOneRow(panelp, i, pos);
5674 }
5675 }
5676
5677
5678 static inline
5679 void MulAddBlock(double *x, const double *y, const double *z)
5680 {
5681 // x += y*z
5682 muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ);
5683 }
5684
5685
// elim_blk_DD: blocked row elimination on the first w columns of the n x m
// matrix A, with the working data held in panels of doubles and reduced
// modulo p only when needed.
//
//   A     input matrix (n rows, m columns).
//   im    if non-null, receives the eliminated rows: r rows when `full` is
//         false, n rows when `full` is true (the rows after the first r are
//         written with their first w columns set to zero).
//   ker   if non-null, receives an (n-r) x n matrix of the form [X | I] with
//         the recorded row swaps undone.
//   w     number of leading columns to eliminate on; must satisfy
//         0 <= w <= m, otherwise LogicError is called.
//   full  selects the shape of *im as described above.
//
// Returns r, the number of pivots found in the first w columns.
// ResourceError is called if NTL_OVERFLOW reports that n*MAT_BLK_SZ or
// m*MAT_BLK_SZ overflows.
//
// Storage layout: M[panel] holds MAT_BLK_SZ consecutive columns for all n
// rows, row-major.  aux_panel has the same shape and collects the
// multipliers produced while the current group of pivot rows [rr, r) is
// being built.
5686 static
5687 long elim_blk_DD(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
5688 long w, bool full)
5689 {
5690 long n = A.NumRows();
5691 long m = A.NumCols();
5692
5693 if (w < 0 || w > m) LogicError("elim: bad args");
5694
5695 // take care of corner cases
5696 if (n == 0) {
5697 if (im) im->SetDims(0, m);
5698 if (ker) ker->SetDims(0, 0);
5699 return 0;
5700 }
5701
5702 if (w == 0) {
5703 if (im) {
5704 if (full)
5705 (*im) = A;
5706 else
5707 im->SetDims(0, m);
5708 }
5709 if (ker) ident(*ker, n);
5710 return 0;
5711 }
5712
5713 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
5714 if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
5715
// number of column panels, rounding up; panels are zero-filled so that the
// unused columns of a partial last panel are well defined
5716 long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;
5717
5718
5719 Vec< AlignedArray<double> > M;
5720 M.SetLength(npanels);
5721 for (long panel = 0; panel < npanels; panel++) {
5722 M[panel].SetLength(n*MAT_BLK_SZ);
5723 double *panelp = &M[panel][0];
5724
5725 for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
5726 }
5727
5728 // copy A into panels
5729 for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
5730 long j_max = min(jj+MAT_BLK_SZ, m);
5731 double *panelp = &M[panel][0];
5732
5733 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
5734 const zz_p *ap = A[i].elts() + jj;
5735
5736 for (long j = jj; j < j_max; j++)
5737 panelp[j-jj] = rep(ap[j-jj]);
5738 }
5739 }
5740
5741 AlignedArray<double> aux_panel_store;
5742 aux_panel_store.SetLength(n*MAT_BLK_SZ);
5743 double * NTL_RESTRICT aux_panel = &aux_panel_store[0];
5744
5745
5746 AlignedArray<double> buf_store1;
5747 buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
5748 double *buf1 = &buf_store1[0];
5749
5750 Vec<long> P;
5751 P.SetLength(n);
5752 for (long k = 0; k < n; k++) P[k] = k;
5753 // records swap operations
5754
5755 Vec<long> pcol;
5756 pcol.SetLength(n);
5757 // pcol[i] records pivot columns for row i
5758
5759 long p = zz_p::modulus();
5760 mulmod_t pinv = zz_p::ModulusInverse();
5761 sp_reduce_struct red_struct = zz_p::red_struct();
5762
5763 bool pivoting = false;
5764
// budget for delayed reduction: red_count starts at red_trigger and is
// decreased by MAT_BLK_SZ on every pass of the outer loop; when it would go
// negative it is reset and `cleanup` requests a full reduction of the
// panels touched during that pass
5765 long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
5766 long red_count = red_trigger;
5767
// r  = pivots found so far (also the next pivot row)
// rr = value of r when the current group of pivot rows was started
// k  = current column, kk = first column of the current column panel
5768 long r = 0, rr = 0, k = 0, kk = 0;
5769 long rpanel = 0, kpanel = 0;
5770
5771 while (k < w) {
5772
5773 if (r > rr && ker) {
5774 // we have a panel from a previous iteration
5775 // we store enough of it to facilitate the kernel
5776 // computation later. At this point, we have
5777 // r == rr+MAT_BLK_SZ, and it suffices to store
5778 // rows [r..n) into M[rpanel], and this will not
5779 // overwrite anything useful in M[rpanel]
5780
5781 double *panelp = &M[rpanel][0];
5782 for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
5783 panelp[h] = aux_panel[h];
5784 }
5785
5786 rpanel++;
5787 }
5788
5789 rr = r;
5790
5791 for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;
5792
5793 bool cleanup = false;
5794
5795 if (red_count-MAT_BLK_SZ < 0) {
5796 red_count = red_trigger;
5797 cleanup = true;
5798 }
5799
5800 red_count = red_count-MAT_BLK_SZ;
5801
// collect up to MAT_BLK_SZ pivot rows, advancing column by column
5802 for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete
5803
5804 if (k == kk+MAT_BLK_SZ) { // start new kpanel
5805 kk = k;
5806 kpanel++;
5807 }
5808
5809 double * NTL_RESTRICT kpanelp = &M[kpanel][0];
5810
5811 if (k == kk) { // a fresh kpanel -- special processing
5812
5813 if (cleanup) {
5814 for (long h = 0; h < n*MAT_BLK_SZ; h++)
5815 kpanelp[h] = rem((unsigned long)(long)kpanelp[h], p, red_struct);
5816 }
5817
// the pivot rows already collected in this group have not yet been
// applied to this newly entered column panel; do that now
5818 if (r > rr) {
5819
5820
5821 // apply current sequence of permutations
5822
5823 ApplySwaps(kpanelp, rr, r, P);
5824
5825 // clean aux_panel
5826 for (long h = 0; h < n*MAT_BLK_SZ; h++)
5827 aux_panel[h] = rem((unsigned long)(long)aux_panel[h], p, red_struct);
5828
5829 // copy rows [rr..r) of kpanel into buf1
5830 for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
5831 buf1[i] = rem((unsigned long)(long)kpanelp[rr*MAT_BLK_SZ+i], p, red_struct);
5832
5833 // kpanel[rr..n) += aux_panel[rr..n)*buf1
5834
5835 muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr);
5836 }
5837 }
5838
5839 long pos = -1;
5840 long pivot;
5841 long pivot_inv;
// search column k at or below row r; each entry inspected is reduced
// modulo p and written back
5842 for (long i = r; i < n; i++) {
5843 pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
5844 kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;
5845
5846 if (pivot != 0) {
5847 pivot_inv = InvMod(pivot, p);
5848 pos = i;
5849 break;
5850 }
5851 }
5852
// no pivot in this column: go to the next column without advancing r
5853 if (pos == -1) {
5854 continue;
5855 }
5856
5857 double * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
5858 double * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
5859 if (r != pos) {
5860 // swap rows pos and r
// only columns [k-kk, MAT_BLK_SZ) of kpanel and the first r-rr columns of
// aux_panel are exchanged
5861 double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
5862 double * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
5863
5864 for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
5865 for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
5866
5867 P[r] = pos;
5868 pivoting = true;
5869 }
5870
5871 // clean up row r of kpanel and aux_panel
5872 for (long j = k-kk; j < MAT_BLK_SZ; j++)
5873 y[j] = rem((unsigned long)(long)y[j], p, red_struct);
5874 for (long j = 0; j < r-rr; j++)
5875 y1[j] = rem((unsigned long)(long)y1[j], p, red_struct);
5876
5877 // clear column
5878 for (long i = r+1; i < n; i++) {
5879 double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
5880 double * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
5881 long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
5882 t1 = MulMod(t1, pivot_inv, p);
5883 t1 = NegateMod(t1, p);
// the entry in kpanel is zeroed and the multiplier is recorded in column
// r-rr of aux_panel
5884 x[k-kk] = 0;
5885 x1[r-rr] = t1;
5886 if (t1 == 0) continue;
5887
5888 // add t1 * row r to row i
// (sums are accumulated without reduction here)
5889 double ut1 = t1;
5890
5891 for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
5892 x[j] += y[j]*ut1;
5893 for (long j = 0; j < r-rr; j++)
5894 x1[j] += y1[j]*ut1;
5895 }
5896
5897 pcol[r] = k;
5898 r++;
5899 }
5900
5901 if (r > rr) {
5902
5903 // we have a panel
5904
5905 // clean it up
5906 for (long h = 0; h < n*MAT_BLK_SZ; h++)
5907 aux_panel[h] = rem((unsigned long)(long)aux_panel[h], p, red_struct);
5908
5909 bool seq =
5910 double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;
5911
5912 // apply aux_panel to remaining panels: [kpanel+1..npanels)
5913 NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
5914 NTL_IMPORT(p)
5915 NTL_IMPORT(n)
5916 NTL_IMPORT(red_struct)
5917 NTL_IMPORT(aux_panel)
5918 NTL_IMPORT(rr)
5919 NTL_IMPORT(r)
5920
5921
// scratch block, allocated once per executed range [first, last)
5922 AlignedArray<double> buf_store;
5923 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
5924 double *buf = &buf_store[0];
5925
5926
5927 for (long index = first; index < last; index++) {
5928 long jpanel = index + kpanel+1;
5929
5930 double * NTL_RESTRICT jpanelp = &M[jpanel][0];
5931
5932 if (cleanup) {
5933 for (long h = 0; h < n*MAT_BLK_SZ; h++)
5934 jpanelp[h] = rem((unsigned long)(long)jpanelp[h], p, red_struct);
5935 }
5936
5937 // perform swaps
5938 ApplySwaps(jpanelp, rr, r, P);
5939
5940 // copy rows [rr..r) of jpanel into buf
5941 for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
5942 buf[i] = rem((unsigned long)(long)jpanelp[rr*MAT_BLK_SZ+i], p, red_struct);
5943
5944 // jpanel[rr..n) += aux_panel[rr..n)*buf
5945
5946 muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr);
5947 }
5948
5949 NTL_GEXEC_RANGE_END
5950
5951 }
5952
5953 }
5954
// image output: entries are reduced modulo p as they are read back
5955 if (im) {
5956 mat_zz_p& Im = *im;;
5957 if (full)
5958 Im.SetDims(n, m);
5959 else
5960 Im.SetDims(r, m);
5961
5962 for (long i = 0; i < r; i++) {
5963 long pc = pcol[i];
5964 for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
5965 for (long j = pc; j < m; j++) {
5966 double t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
5967 Im[i][j].LoopHole() = rem((unsigned long)(long)t0, p, red_struct);
5968 }
5969 }
5970
5971 if (full) {
5972 for (long i = r; i < n; i++) {
5973 for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
5974 for (long j = w; j < m; j++) {
5975 double t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
5976 Im[i][j].LoopHole() = rem((unsigned long)(long)t0, p, red_struct);
5977 }
5978 }
5979 }
5980 }
5981
5982 if (ker) {
5983 mat_zz_p& Ker = *ker;
5984 Ker.SetDims(n-r, n);
5985 if (r < n) {
5986
// block rows [start_block, end_block) cover rows r..n-1;
// hblocks is the number of block columns needed for r pivot columns
5987 long start_block = r/MAT_BLK_SZ;
5988 long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
5989 long vblocks = end_block-start_block;
5990 long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;
5991
// kerbuf[v] holds hblocks blocks of MAT_BLK_SZ x MAT_BLK_SZ doubles for
// block row start_block+v
5992 Vec< AlignedArray<double> > kerbuf;
5993 kerbuf.SetLength(vblocks);
5994 for (long i = 0; i < vblocks; i++)
5995 kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);
5996
// NOTE(review): colblocks is not referenced again in this function
5997 long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
5998
5999 // if r > rr, we have a panel sitting in
6000 // aux_panel, which may or may not be a full panel
6001
6002 double *initial_panel = 0;
6003 if (r > rr) {
6004 initial_panel = aux_panel;
6005 }
6006 else {
6007 initial_panel = &M[hblocks-1][0];
6008 }
6009
// seed the last block column of kerbuf from the initial panel
6010 for (long vb = start_block; vb < end_block; vb++)
6011 CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);
6012
// for each earlier stored panel: apply the swaps recorded for rows
// [(hb+1)*MAT_BLK_SZ, r), then shift its blocks hb+1..end_block-1 up by
// one block position
6013 for (long hb = hblocks-2; hb >= 0; hb--) {
6014
6015 ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);
6016
6017 for (long b = hb+1; b < end_block; b++)
6018 CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
6019 }
6020
6021 bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;
6022
6023
6024 NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
6025 NTL_IMPORT(p)
6026 NTL_IMPORT(red_struct)
6027 NTL_IMPORT(hblocks)
6028
// each block row of kerbuf is processed independently, from the last
// block column towards the first
6029 for (long index = first; index < last; index++) {
6030 long vb = index + start_block;
6031 double *kerbufp = &kerbuf[vb-start_block][0];
6032
6033 for (long hb = hblocks-2; hb >= 0; hb--) {
6034 double *colbuf = &M[hb][0];
6035 double *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];
6036
6037 CopyBlock(acc, 0, colbuf, vb-1);
6038
// local delayed-reduction budget; these names shadow the outer
// red_trigger / red_count
6039 long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
6040 long red_count = red_trigger;
6041
6042 for (long b = hb+1; b < hblocks; b++) {
6043
6044 if (red_count-MAT_BLK_SZ < 0) {
6045 red_count = red_trigger;
6046 for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
6047 acc[h] = rem((unsigned long)(long)acc[h], p, red_struct);
6048
6049 }
6050 red_count = red_count-MAT_BLK_SZ;
6051
6052 MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
6053 &colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ]);
6054 }
6055
// final reduction of the accumulated block
6056 for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
6057 acc[h] = rem((unsigned long)(long)acc[h], p, red_struct);
6058 }
6059 }
6060
6061 NTL_GEXEC_RANGE_END
6062
// left part of Ker: read rows r..n-1 back out of kerbuf
6063 for (long i = r; i < n; i++) {
6064
6065 double *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];
6066
6067 for (long j = 0; j < r; j++) {
6068 double t0 =
6069 kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
6070 (i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
6071
6072 Ker[i-r][j].LoopHole() = long(t0);
6073 }
6074 }
6075
// right part of Ker: the (n-r) x (n-r) identity
6076 for (long i = 0; i < n-r; i++) {
6077 for (long j = 0; j < n-r; j++) {
6078 Ker[i][j+r].LoopHole() = 0;
6079 }
6080 Ker[i][i+r].LoopHole() = 1;
6081 }
6082
// undo the recorded row swaps, in reverse order, on the columns of Ker
6083 if (pivoting) {
6084 for (long i = 0; i < n-r; i++) {
6085 zz_p *x = Ker[i].elts();
6086
6087 for (long k = n-1; k >= 0; k--) {
6088 long pos = P[k];
6089 if (pos != k) swap(x[pos], x[k]);
6090 }
6091 }
6092 }
6093 }
6094 }
6095
6096 return r;
6097
6098 }
6099
6100 #endif
6101
6102
6103
6104 static inline
6105 void CopyBlock(unsigned long *dst_ptr, long dst_blk, const unsigned long *src_ptr, long src_blk, long src_limit)
6106 {
6107 long src_row = src_blk*MAT_BLK_SZ;
6108 long dst_row = dst_blk*MAT_BLK_SZ;
6109
6110 long nrows = min(MAT_BLK_SZ, src_limit - src_row);
6111
6112 for (long i = 0; i < nrows; i++)
6113 for (long j = 0; j < MAT_BLK_SZ; j++)
6114 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
6115
6116 for (long i = nrows; i < MAT_BLK_SZ; i++)
6117 for (long j = 0; j < MAT_BLK_SZ; j++)
6118 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
6119
6120 }
6121
6122 static inline
6123 void CopyBlock(unsigned long *dst_ptr, long dst_blk, const unsigned long *src_ptr, long src_blk)
6124 {
6125 long src_row = src_blk*MAT_BLK_SZ;
6126 long dst_row = dst_blk*MAT_BLK_SZ;
6127
6128 long nrows = MAT_BLK_SZ;
6129
6130 for (long i = 0; i < nrows; i++)
6131 for (long j = 0; j < MAT_BLK_SZ; j++)
6132 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
6133 }
6134
6135 static inline
6136 void TransposeBlock(unsigned long *dst_ptr, long dst_blk)
6137 {
6138 dst_ptr += dst_blk*MAT_BLK_SZ*MAT_BLK_SZ;
6139
6140 for (long i = 0; i < MAT_BLK_SZ; i++)
6141 for (long j = 0; j < i; j++)
6142 _ntl_swap(dst_ptr[i*MAT_BLK_SZ+j], dst_ptr[i+j*MAT_BLK_SZ]);
6143 }
6144
6145 static inline
6146 void SwapOneRow(unsigned long *panelp, long i, long pos)
6147 {
6148 unsigned long * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
6149 unsigned long * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
6150 for (long j = 0; j < MAT_BLK_SZ; j++)
6151 _ntl_swap(pos_p[j], i_p[j]);
6152 }
6153
6154 static inline
6155 void ApplySwaps(unsigned long *panelp, long start, long end, const Vec<long>& P)
6156 {
6157 for (long i = start; i < end; i++) {
6158 long pos = P[i];
6159 if (pos != i)
6160 SwapOneRow(panelp, i, pos);
6161 }
6162 }
6163
6164
6165 static inline
6166 void MulAddBlock(unsigned long *x, const unsigned long *y, const unsigned long *z)
6167 {
6168 // x += y*z
6169
6170 muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ);
6171 }
6172
6173
6174 static
6175 long elim_blk_L(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
6176 long w, bool full)
6177 {
6178 long n = A.NumRows();
6179 long m = A.NumCols();
6180
6181 if (w < 0 || w > m) LogicError("elim: bad args");
6182
6183 // take care of corner cases
6184 if (n == 0) {
6185 if (im) im->SetDims(0, m);
6186 if (ker) ker->SetDims(0, 0);
6187 return 0;
6188 }
6189
6190 if (w == 0) {
6191 if (im) {
6192 if (full)
6193 (*im) = A;
6194 else
6195 im->SetDims(0, m);
6196 }
6197 if (ker) ident(*ker, n);
6198 return 0;
6199 }
6200
6201 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
6202 if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
6203
6204 long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;
6205
6206
6207 Vec< UniqueArray<unsigned long> > M;
6208 M.SetLength(npanels);
6209 for (long panel = 0; panel < npanels; panel++) {
6210 M[panel].SetLength(n*MAT_BLK_SZ);
6211 unsigned long *panelp = &M[panel][0];
6212
6213 for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
6214 }
6215
6216 // copy A into panels
6217 for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
6218 long j_max = min(jj+MAT_BLK_SZ, m);
6219 unsigned long *panelp = &M[panel][0];
6220
6221 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
6222 const zz_p *ap = A[i].elts() + jj;
6223
6224 for (long j = jj; j < j_max; j++)
6225 panelp[j-jj] = rep(ap[j-jj]);
6226 }
6227 }
6228
6229 UniqueArray<unsigned long> aux_panel_store;
6230 aux_panel_store.SetLength(n*MAT_BLK_SZ);
6231 unsigned long * NTL_RESTRICT aux_panel = &aux_panel_store[0];
6232
6233
6234 UniqueArray<unsigned long> buf_store1;
6235 buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
6236 unsigned long *buf1 = &buf_store1[0];
6237
6238 Vec<long> P;
6239 P.SetLength(n);
6240 for (long k = 0; k < n; k++) P[k] = k;
6241 // records swap operations
6242
6243 Vec<long> pcol;
6244 pcol.SetLength(n);
6245 // pcol[i] records pivot columns for row i
6246
6247 long p = zz_p::modulus();
6248 mulmod_t pinv = zz_p::ModulusInverse();
6249 sp_reduce_struct red_struct = zz_p::red_struct();
6250
6251 bool pivoting = false;
6252
6253 unsigned long ured_trigger =
6254 (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
6255 // NOTE: corner case at p == 2: need unsigned long to prevent overflow
6256
6257 long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
6258
6259 long red_count = red_trigger;
6260
6261 long r = 0, rr = 0, k = 0, kk = 0;
6262 long rpanel = 0, kpanel = 0;
6263
6264 while (k < w) {
6265
6266 if (r > rr && ker) {
6267 // we have a panel from a previous iteration
6268 // we store enough of it to facilitate the kernel
6269 // computation later. At this point, we have
6270 // r == rr+INV_BLK_SIZE, and it suffices to store
6271 // rows [r..n) into M[rpanel], and this will not
6272 // overwrite anything useful in M[rpanel]
6273
6274 unsigned long *panelp = &M[rpanel][0];
6275 for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
6276 panelp[h] = aux_panel[h];
6277 }
6278
6279 rpanel++;
6280 }
6281
6282 rr = r;
6283
6284 for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;
6285
6286 bool cleanup = false;
6287
6288 if (red_count-MAT_BLK_SZ < 0) {
6289 red_count = red_trigger;
6290 cleanup = true;
6291 }
6292
6293 red_count = red_count-MAT_BLK_SZ;
6294
6295 for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete
6296
6297 if (k == kk+MAT_BLK_SZ) { // start new kpanel
6298 kk = k;
6299 kpanel++;
6300 }
6301
6302 unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
6303
6304 if (k == kk) { // a fresh kpanel -- special processing
6305
6306 if (cleanup) {
6307 for (long h = 0; h < n*MAT_BLK_SZ; h++)
6308 kpanelp[h] = rem(kpanelp[h], p, red_struct);
6309 }
6310
6311 if (r > rr) {
6312
6313
6314 // apply current sequence of permutations
6315
6316 ApplySwaps(kpanelp, rr, r, P);
6317
6318 // clean aux_panel
6319 for (long h = 0; h < n*MAT_BLK_SZ; h++)
6320 aux_panel[h] = rem(aux_panel[h], p, red_struct);
6321
6322 // copy rows [rr..r) of kpanel into buf1
6323 for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
6324 buf1[i] = rem(kpanelp[rr*MAT_BLK_SZ+i], p, red_struct);
6325
6326 TransposeBlock(buf1, 0);
6327
6328 // kpanel[rr..n) += aux_panel[rr..n)*buf1
6329
6330 muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr);
6331 }
6332 }
6333
6334 long pos = -1;
6335 long pivot;
6336 long pivot_inv;
6337 for (long i = r; i < n; i++) {
6338 pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
6339 kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;
6340
6341 if (pivot != 0) {
6342 pivot_inv = InvMod(pivot, p);
6343 pos = i;
6344 break;
6345 }
6346 }
6347
6348 if (pos == -1) {
6349 continue;
6350 }
6351
6352 unsigned long * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
6353 unsigned long * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
6354 if (r != pos) {
6355 // swap rows pos and r
6356 unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
6357 unsigned long * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
6358
6359 for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
6360 for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
6361
6362 P[r] = pos;
6363 pivoting = true;
6364 }
6365
6366 // clean up row r of kpanel and aux_panel
6367 for (long j = k-kk; j < MAT_BLK_SZ; j++)
6368 y[j] = rem(y[j], p, red_struct);
6369 for (long j = 0; j < r-rr; j++)
6370 y1[j] = rem(y1[j], p, red_struct);
6371
6372 // clear column
6373 for (long i = r+1; i < n; i++) {
6374 unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
6375 unsigned long * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
6376 long t1 = rem(x[k-kk], p, red_struct);
6377 t1 = MulMod(t1, pivot_inv, p);
6378 t1 = NegateMod(t1, p);
6379 x[k-kk] = 0;
6380 x1[r-rr] = t1;
6381 if (t1 == 0) continue;
6382
6383 // add t1 * row r to row i
6384 unsigned long ut1 = t1;
6385
6386 for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
6387 x[j] += y[j]*ut1;
6388 for (long j = 0; j < r-rr; j++)
6389 x1[j] += y1[j]*ut1;
6390 }
6391
6392 pcol[r] = k;
6393 r++;
6394 }
6395
6396 if (r > rr) {
6397
6398 // we have a panel
6399
6400 // clean it up
6401 for (long h = 0; h < n*MAT_BLK_SZ; h++)
6402 aux_panel[h] = rem(aux_panel[h], p, red_struct);
6403
6404 bool seq =
6405 double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;
6406
6407 // apply aux_panel to remaining panels: [kpanel+1..npanels)
6408 NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
6409 NTL_IMPORT(p)
6410 NTL_IMPORT(n)
6411 NTL_IMPORT(red_struct)
6412 NTL_IMPORT(aux_panel)
6413 NTL_IMPORT(rr)
6414 NTL_IMPORT(r)
6415
6416
6417 UniqueArray<unsigned long> buf_store;
6418 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
6419 unsigned long *buf = &buf_store[0];
6420
6421
6422 for (long index = first; index < last; index++) {
6423 long jpanel = index + kpanel+1;
6424
6425 unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
6426
6427 if (cleanup) {
6428 for (long h = 0; h < n*MAT_BLK_SZ; h++)
6429 jpanelp[h] = rem(jpanelp[h], p, red_struct);
6430 }
6431
6432 // perform swaps
6433 ApplySwaps(jpanelp, rr, r, P);
6434
6435 // copy rows [rr..r) of jpanel into buf
6436 for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
6437 buf[i] = rem(jpanelp[rr*MAT_BLK_SZ+i], p, red_struct);
6438
6439 TransposeBlock(buf, 0);
6440
6441 // jpanel[rr..n) += aux_panel[rr..n)*buf
6442
6443 muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr);
6444 }
6445
6446 NTL_GEXEC_RANGE_END
6447
6448 }
6449
6450 }
6451
6452 if (im) {
6453 mat_zz_p& Im = *im;;
6454 if (full)
6455 Im.SetDims(n, m);
6456 else
6457 Im.SetDims(r, m);
6458
6459 for (long i = 0; i < r; i++) {
6460 long pc = pcol[i];
6461 for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
6462 for (long j = pc; j < m; j++) {
6463 unsigned long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
6464 Im[i][j].LoopHole() = rem(t0, p, red_struct);
6465 }
6466 }
6467
6468 if (full) {
6469 for (long i = r; i < n; i++) {
6470 for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
6471 for (long j = w; j < m; j++) {
6472 unsigned long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
6473 Im[i][j].LoopHole() = rem(t0, p, red_struct);
6474 }
6475 }
6476 }
6477 }
6478
6479 if (ker) {
6480 mat_zz_p& Ker = *ker;
6481 Ker.SetDims(n-r, n);
6482 if (r < n) {
6483
6484 long start_block = r/MAT_BLK_SZ;
6485 long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
6486 long vblocks = end_block-start_block;
6487 long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;
6488
6489 Vec< UniqueArray<unsigned long> > kerbuf;
6490 kerbuf.SetLength(vblocks);
6491 for (long i = 0; i < vblocks; i++)
6492 kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);
6493
6494 long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
6495
6496 // if r > rr, we have a panel sitting in
6497 // aux_panel, which may or may not be a full panel
6498
6499 unsigned long *initial_panel = 0;
6500 if (r > rr) {
6501 initial_panel = aux_panel;
6502 }
6503 else {
6504 initial_panel = &M[hblocks-1][0];
6505 }
6506
6507 for (long vb = start_block; vb < end_block; vb++)
6508 CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);
6509
6510 for (long hb = hblocks-2; hb >= 0; hb--) {
6511
6512 ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);
6513
6514 for (long b = hb+1; b < end_block; b++) {
6515 CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
6516 TransposeBlock(&M[hb][0], b-1);
6517 }
6518 }
6519
6520 bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;
6521
6522
6523 NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
6524 NTL_IMPORT(p)
6525 NTL_IMPORT(red_struct)
6526 NTL_IMPORT(hblocks)
6527
6528 for (long index = first; index < last; index++) {
6529 long vb = index + start_block;
6530 unsigned long *kerbufp = &kerbuf[vb-start_block][0];
6531
6532 for (long hb = hblocks-2; hb >= 0; hb--) {
6533 unsigned long *colbuf = &M[hb][0];
6534 unsigned long *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];
6535
6536 CopyBlock(acc, 0, colbuf, vb-1);
6537 TransposeBlock(acc, 0);
6538
6539
6540 unsigned long ured_trigger =
6541 (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
6542 // NOTE: corner case at p == 2: need unsigned long to prevent overflow
6543
6544 long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
6545 long red_count = red_trigger;
6546
6547 for (long b = hb+1; b < hblocks; b++) {
6548
6549 if (red_count-MAT_BLK_SZ < 0) {
6550 red_count = red_trigger;
6551 for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
6552 acc[h] = rem(acc[h], p, red_struct);
6553
6554 }
6555 red_count = red_count-MAT_BLK_SZ;
6556
6557 MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
6558 &colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ]);
6559 }
6560
6561 for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
6562 acc[h] = rem(acc[h], p, red_struct);
6563 }
6564 }
6565
6566 NTL_GEXEC_RANGE_END
6567
6568 for (long i = r; i < n; i++) {
6569
6570 unsigned long *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];
6571
6572 for (long j = 0; j < r; j++) {
6573 unsigned long t0 =
6574 kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
6575 (i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
6576
6577 Ker[i-r][j].LoopHole() = long(t0);
6578 }
6579 }
6580
6581 for (long i = 0; i < n-r; i++) {
6582 for (long j = 0; j < n-r; j++) {
6583 Ker[i][j+r].LoopHole() = 0;
6584 }
6585 Ker[i][i+r].LoopHole() = 1;
6586 }
6587
6588 if (pivoting) {
6589 for (long i = 0; i < n-r; i++) {
6590 zz_p *x = Ker[i].elts();
6591
6592 for (long k = n-1; k >= 0; k--) {
6593 long pos = P[k];
6594 if (pos != k) swap(x[pos], x[k]);
6595 }
6596 }
6597 }
6598 }
6599 }
6600
6601 return r;
6602
6603 }
6604
6605
6606 static inline
6607 void CopyBlock(long *dst_ptr, long dst_blk, const long *src_ptr, long src_blk, long src_limit)
6608 {
6609 long src_row = src_blk*MAT_BLK_SZ;
6610 long dst_row = dst_blk*MAT_BLK_SZ;
6611
6612 long nrows = min(MAT_BLK_SZ, src_limit - src_row);
6613
6614 for (long i = 0; i < nrows; i++)
6615 for (long j = 0; j < MAT_BLK_SZ; j++)
6616 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
6617
6618 for (long i = nrows; i < MAT_BLK_SZ; i++)
6619 for (long j = 0; j < MAT_BLK_SZ; j++)
6620 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
6621
6622 }
6623
6624 static inline
6625 void CopyBlock(long *dst_ptr, long dst_blk, const long *src_ptr, long src_blk)
6626 {
6627 long src_row = src_blk*MAT_BLK_SZ;
6628 long dst_row = dst_blk*MAT_BLK_SZ;
6629
6630 long nrows = MAT_BLK_SZ;
6631
6632 for (long i = 0; i < nrows; i++)
6633 for (long j = 0; j < MAT_BLK_SZ; j++)
6634 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
6635 }
6636
6637 static inline
6638 void TransposeBlock(long *dst_ptr, long dst_blk)
6639 {
6640 dst_ptr += dst_blk*MAT_BLK_SZ*MAT_BLK_SZ;
6641
6642 for (long i = 0; i < MAT_BLK_SZ; i++)
6643 for (long j = 0; j < i; j++)
6644 _ntl_swap(dst_ptr[i*MAT_BLK_SZ+j], dst_ptr[i+j*MAT_BLK_SZ]);
6645 }
6646
6647 static inline
6648 void SwapOneRow(long *panelp, long i, long pos)
6649 {
6650 long * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
6651 long * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
6652 for (long j = 0; j < MAT_BLK_SZ; j++)
6653 _ntl_swap(pos_p[j], i_p[j]);
6654 }
6655
6656 static inline
6657 void ApplySwaps(long *panelp, long start, long end, const Vec<long>& P)
6658 {
6659 for (long i = start; i < end; i++) {
6660 long pos = P[i];
6661 if (pos != i)
6662 SwapOneRow(panelp, i, pos);
6663 }
6664 }
6665
6666
6667 static inline
6668 void MulAddBlock(long *x, const long *y, const long *z,
6669 long p, sp_ll_reduce_struct ll_red_struct)
6670 {
6671 // x += y*z
6672
6673 muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ, p, ll_red_struct);
6674 }
6675
6676
6677
6678 static
6679 long elim_blk_LL(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
6680 long w, bool full)
6681 {
6682 long n = A.NumRows();
6683 long m = A.NumCols();
6684
6685 if (w < 0 || w > m) LogicError("elim: bad args");
6686
6687 // take care of corner cases
6688 if (n == 0) {
6689 if (im) im->SetDims(0, m);
6690 if (ker) ker->SetDims(0, 0);
6691 return 0;
6692 }
6693
6694 if (w == 0) {
6695 if (im) {
6696 if (full)
6697 (*im) = A;
6698 else
6699 im->SetDims(0, m);
6700 }
6701 if (ker) ident(*ker, n);
6702 return 0;
6703 }
6704
6705 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
6706 if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
6707
6708 long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;
6709
6710
6711 Vec< UniqueArray<long> > M;
6712 M.SetLength(npanels);
6713 for (long panel = 0; panel < npanels; panel++) {
6714 M[panel].SetLength(n*MAT_BLK_SZ);
6715 long *panelp = &M[panel][0];
6716
6717 for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
6718 }
6719
6720 // copy A into panels
6721 for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
6722 long j_max = min(jj+MAT_BLK_SZ, m);
6723 long *panelp = &M[panel][0];
6724
6725 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
6726 const zz_p *ap = A[i].elts() + jj;
6727
6728 for (long j = jj; j < j_max; j++)
6729 panelp[j-jj] = rep(ap[j-jj]);
6730 }
6731 }
6732
6733 UniqueArray<long> aux_panel_store;
6734 aux_panel_store.SetLength(n*MAT_BLK_SZ);
6735 long * NTL_RESTRICT aux_panel = &aux_panel_store[0];
6736
6737
6738 UniqueArray<long> buf_store1;
6739 buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
6740 long *buf1 = &buf_store1[0];
6741
6742 Vec<long> P;
6743 P.SetLength(n);
6744 for (long k = 0; k < n; k++) P[k] = k;
6745 // records swap operations
6746
6747 Vec<long> pcol;
6748 pcol.SetLength(n);
6749 // pcol[i] records pivot columns for row i
6750
6751 long p = zz_p::modulus();
6752 mulmod_t pinv = zz_p::ModulusInverse();
6753 sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
6754
6755 bool pivoting = false;
6756
6757 long r = 0, rr = 0, k = 0, kk = 0;
6758 long rpanel = 0, kpanel = 0;
6759
6760 while (k < w) {
6761
6762 if (r > rr && ker) {
6763 // we have a panel from a previous iteration
6764 // we store enough of it to facilitate the kernel
6765 // computation later. At this point, we have
6766 // r == rr+INV_BLK_SIZE, and it suffices to store
6767 // rows [r..n) into M[rpanel], and this will not
6768 // overwrite anything useful in M[rpanel]
6769
6770 long *panelp = &M[rpanel][0];
6771 for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
6772 panelp[h] = aux_panel[h];
6773 }
6774
6775 rpanel++;
6776 }
6777
6778 rr = r;
6779
6780 for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;
6781
6782 for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete
6783
6784 if (k == kk+MAT_BLK_SZ) { // start new kpanel
6785 kk = k;
6786 kpanel++;
6787 }
6788
6789 long * NTL_RESTRICT kpanelp = &M[kpanel][0];
6790
6791 if (k == kk) { // a fresh kpanel -- special processing
6792
6793
6794 if (r > rr) {
6795
6796
6797 // apply current sequence of permutations
6798
6799 ApplySwaps(kpanelp, rr, r, P);
6800
6801 // copy rows [rr..r) of kpanel into buf1
6802 for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
6803 buf1[i] = kpanelp[rr*MAT_BLK_SZ+i];
6804
6805 TransposeBlock(buf1, 0);
6806
6807 // kpanel[rr..n) += aux_panel[rr..n)*buf1
6808
6809 muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr, p, ll_red_struct);
6810 }
6811 }
6812
6813 long pos = -1;
6814 long pivot;
6815 long pivot_inv;
6816 for (long i = r; i < n; i++) {
6817 pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
6818 kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;
6819
6820 if (pivot != 0) {
6821 pivot_inv = InvMod(pivot, p);
6822 pos = i;
6823 break;
6824 }
6825 }
6826
6827 if (pos == -1) {
6828 continue;
6829 }
6830
6831 long * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
6832 long * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
6833 if (r != pos) {
6834 // swap rows pos and r
6835 long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
6836 long * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
6837
6838 for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
6839 for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
6840
6841 P[r] = pos;
6842 pivoting = true;
6843 }
6844
6845 // clear column
6846 for (long i = r+1; i < n; i++) {
6847 long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
6848 long * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
6849 long t1 = x[k-kk];
6850 t1 = MulMod(t1, pivot_inv, p);
6851 t1 = NegateMod(t1, p);
6852 x[k-kk] = 0;
6853 x1[r-rr] = t1;
6854 if (t1 == 0) continue;
6855
6856 // add t1 * row r to row i
6857 long ut1 = t1;
6858 mulmod_precon_t ut1_pinv = PrepMulModPrecon(ut1, p, pinv);
6859
6860 for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
6861 x[j] = AddMod(x[j], MulModPrecon(y[j], ut1, p, ut1_pinv), p);
6862 for (long j = 0; j < r-rr; j++)
6863 x1[j] = AddMod(x1[j], MulModPrecon(y1[j], ut1, p, ut1_pinv), p);
6864 }
6865
6866 pcol[r] = k;
6867 r++;
6868 }
6869
6870 if (r > rr) {
6871
6872 // we have a panel
6873
6874 bool seq =
6875 double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;
6876
6877 // apply aux_panel to remaining panels: [kpanel+1..npanels)
6878 NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
6879 NTL_IMPORT(p)
6880 NTL_IMPORT(n)
6881 NTL_IMPORT(ll_red_struct)
6882 NTL_IMPORT(aux_panel)
6883 NTL_IMPORT(rr)
6884 NTL_IMPORT(r)
6885
6886
6887 UniqueArray<long> buf_store;
6888 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
6889 long *buf = &buf_store[0];
6890
6891
6892 for (long index = first; index < last; index++) {
6893 long jpanel = index + kpanel+1;
6894
6895 long * NTL_RESTRICT jpanelp = &M[jpanel][0];
6896
6897 // perform swaps
6898 ApplySwaps(jpanelp, rr, r, P);
6899
6900 // copy rows [rr..r) of jpanel into buf
6901 for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
6902 buf[i] = jpanelp[rr*MAT_BLK_SZ+i];
6903
6904 TransposeBlock(buf, 0);
6905
6906 // jpanel[rr..n) += aux_panel[rr..n)*buf
6907
6908 muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr, p, ll_red_struct);
6909 }
6910
6911 NTL_GEXEC_RANGE_END
6912
6913 }
6914
6915 }
6916
6917 if (im) {
6918 mat_zz_p& Im = *im;;
6919 if (full)
6920 Im.SetDims(n, m);
6921 else
6922 Im.SetDims(r, m);
6923
6924 for (long i = 0; i < r; i++) {
6925 long pc = pcol[i];
6926 for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
6927 for (long j = pc; j < m; j++) {
6928 long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
6929 Im[i][j].LoopHole() = t0;
6930 }
6931 }
6932
6933 if (full) {
6934 for (long i = r; i < n; i++) {
6935 for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
6936 for (long j = w; j < m; j++) {
6937 long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
6938 Im[i][j].LoopHole() = t0;
6939 }
6940 }
6941 }
6942 }
6943
6944 if (ker) {
6945 mat_zz_p& Ker = *ker;
6946 Ker.SetDims(n-r, n);
6947 if (r < n) {
6948
6949 long start_block = r/MAT_BLK_SZ;
6950 long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
6951 long vblocks = end_block-start_block;
6952 long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;
6953
6954 Vec< UniqueArray<long> > kerbuf;
6955 kerbuf.SetLength(vblocks);
6956 for (long i = 0; i < vblocks; i++)
6957 kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);
6958
6959 long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
6960
6961 // if r > rr, we have a panel sitting in
6962 // aux_panel, which may or may not be a full panel
6963
6964 long *initial_panel = 0;
6965 if (r > rr) {
6966 initial_panel = aux_panel;
6967 }
6968 else {
6969 initial_panel = &M[hblocks-1][0];
6970 }
6971
6972 for (long vb = start_block; vb < end_block; vb++)
6973 CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);
6974
6975 for (long hb = hblocks-2; hb >= 0; hb--) {
6976
6977 ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);
6978
6979 for (long b = hb+1; b < end_block; b++) {
6980 CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
6981 TransposeBlock(&M[hb][0], b-1);
6982 }
6983 }
6984
6985 bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;
6986
6987
6988 NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
6989 NTL_IMPORT(p)
6990 NTL_IMPORT(ll_red_struct)
6991 NTL_IMPORT(hblocks)
6992
6993 for (long index = first; index < last; index++) {
6994 long vb = index + start_block;
6995 long *kerbufp = &kerbuf[vb-start_block][0];
6996
6997 for (long hb = hblocks-2; hb >= 0; hb--) {
6998 long *colbuf = &M[hb][0];
6999 long *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];
7000
7001 CopyBlock(acc, 0, colbuf, vb-1);
7002 TransposeBlock(acc, 0);
7003
7004 for (long b = hb+1; b < hblocks; b++) {
7005 MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
7006 &colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ], p, ll_red_struct);
7007 }
7008 }
7009 }
7010
7011 NTL_GEXEC_RANGE_END
7012
7013 for (long i = r; i < n; i++) {
7014
7015 long *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];
7016
7017 for (long j = 0; j < r; j++) {
7018 long t0 =
7019 kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
7020 (i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
7021
7022 Ker[i-r][j].LoopHole() = long(t0);
7023 }
7024 }
7025
7026 for (long i = 0; i < n-r; i++) {
7027 for (long j = 0; j < n-r; j++) {
7028 Ker[i][j+r].LoopHole() = 0;
7029 }
7030 Ker[i][i+r].LoopHole() = 1;
7031 }
7032
7033 if (pivoting) {
7034 for (long i = 0; i < n-r; i++) {
7035 zz_p *x = Ker[i].elts();
7036
7037 for (long k = n-1; k >= 0; k--) {
7038 long pos = P[k];
7039 if (pos != k) swap(x[pos], x[k]);
7040 }
7041 }
7042 }
7043 }
7044 }
7045
7046 return r;
7047
7048 }
7049
7050
7051 #endif
7052
7053
7054
7055 static
7056 long elim(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker, long w, bool full)
7057 {
7058 long n = A.NumRows();
7059 long m = A.NumCols();
7060
7061 if (w < 0 || w > m) LogicError("elim: bad args");
7062
7063 #ifndef NTL_HAVE_LL_TYPE
7064
7065 return elim_basic(A, im, ker, w, full);
7066
7067 #else
7068
7069 long p = zz_p::modulus();
7070
7071 if (n/MAT_BLK_SZ < 4 || w/MAT_BLK_SZ < 4) {
7072 return elim_basic(A, im, ker, w, full);
7073 }
7074 else {
7075 long V = 4*MAT_BLK_SZ;
7076
7077 #ifdef NTL_HAVE_AVX
7078 if (p-1 <= MAX_DBL_INT &&
7079 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
7080 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
7081
7082 return elim_blk_DD(A, im, ker, w, full);
7083 }
7084 else
7085 #endif
7086 if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
7087 cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
7088
7089 return elim_blk_L(A, im, ker, w, full);
7090
7091 }
7092 else {
7093
7094 return elim_blk_LL(A, im, ker, w, full);
7095 }
7096
7097 }
7098
7099 #endif
7100
7101
7102
7103 }
7104
7105
7106 // ******************************************************************
7107 //
7108 // High level interfaces
7109 //
7110 // ******************************************************************
7111
7112
7113
7114 long gauss(mat_zz_p& M, long w)
7115 {
7116 return elim(M, &M, 0, w, true);
7117 }
7118
7119
7120 long gauss(mat_zz_p& M)
7121 {
7122 return gauss(M, M.NumCols());
7123 }
7124
7125 void image(mat_zz_p& X, const mat_zz_p& A)
7126 {
7127 elim(A, &X, 0, A.NumCols(), false);
7128 }
7129
7130 void kernel(mat_zz_p& X, const mat_zz_p& A)
7131 {
7132 elim(A, 0, &X, A.NumCols(), false);
7133 }
7134
7135
7136 // ******************************************************************
7137 //
7138 // Operator/functional notation
7139 //
7140 // ******************************************************************
7141
7142
7143
7144
7145 mat_zz_p operator+(const mat_zz_p& a, const mat_zz_p& b)
7146 {
7147 mat_zz_p res;
7148 add(res, a, b);
7149 NTL_OPT_RETURN(mat_zz_p, res);
7150 }
7151
7152 mat_zz_p operator*(const mat_zz_p& a, const mat_zz_p& b)
7153 {
7154 mat_zz_p res;
7155 mul_aux(res, a, b);
7156 NTL_OPT_RETURN(mat_zz_p, res);
7157 }
7158
7159 mat_zz_p operator-(const mat_zz_p& a, const mat_zz_p& b)
7160 {
7161 mat_zz_p res;
7162 sub(res, a, b);
7163 NTL_OPT_RETURN(mat_zz_p, res);
7164 }
7165
7166
7167 mat_zz_p operator-(const mat_zz_p& a)
7168 {
7169 mat_zz_p res;
7170 negate(res, a);
7171 NTL_OPT_RETURN(mat_zz_p, res);
7172 }
7173
7174
7175 vec_zz_p operator*(const mat_zz_p& a, const vec_zz_p& b)
7176 {
7177 vec_zz_p res;
7178 mul_aux(res, a, b);
7179 NTL_OPT_RETURN(vec_zz_p, res);
7180 }
7181
7182 vec_zz_p operator*(const vec_zz_p& a, const mat_zz_p& b)
7183 {
7184 vec_zz_p res;
7185 mul(res, a, b);
7186 NTL_OPT_RETURN(vec_zz_p, res);
7187 }
7188
7189
9487190 NTL_END_IMPL
316316 }
317317
318318
319 void solve(zz_pE& d, vec_zz_pE& X,
320 const mat_zz_pE& A, const vec_zz_pE& b)
319 static
320 void solve_impl(zz_pE& d, vec_zz_pE& X, const mat_zz_pE& A, const vec_zz_pE& b, bool trans)
321321
322322 {
323323 long n = A.NumRows();
346346
347347 for (i = 0; i < n; i++) {
348348 M[i].SetLength(n+1);
349 for (j = 0; j < n; j++) {
350 M[i][j].rep.SetMaxLength(2*deg(p)-1);
351 M[i][j] = rep(A[j][i]);
349 if (trans) {
350 for (j = 0; j < n; j++) {
351 M[i][j].rep.SetMaxLength(2*deg(p)-1);
352 M[i][j] = rep(A[j][i]);
353 }
354 }
355 else {
356 for (j = 0; j < n; j++) {
357 M[i][j].rep.SetMaxLength(2*deg(p)-1);
358 M[i][j] = rep(A[i][j]);
359 }
352360 }
353361 M[i][n].rep.SetMaxLength(2*deg(p)-1);
354362 M[i][n] = rep(b[i]);
418426 }
419427
420428 conv(d, det);
429 }
430
431 void solve(zz_pE& d, vec_zz_pE& x, const mat_zz_pE& A, const vec_zz_pE& b)
432 {
433 solve_impl(d, x, A, b, true);
434 }
435
436 void solve(zz_pE& d, const mat_zz_pE& A, vec_zz_pE& x, const vec_zz_pE& b)
437 {
438 solve_impl(d, x, A, b, false);
421439 }
422440
423441 void inv(zz_pE& d, mat_zz_pE& X, const mat_zz_pE& A)
0
1 #include <NTL/mat_lzz_p.h>
2
3 NTL_CLIENT
4
5
6
7 void FillRandom(Mat<zz_p>& A)
8 {
9 long n = A.NumRows();
10 long m = A.NumCols();
11 for (long i = 0; i < n; i++)
12 for (long j = 0; j < m; j++)
13 random(A[i][j]);
14 }
15
16 void FillRandom1(Mat<zz_p>& A)
17 {
18 long n = A.NumRows();
19 long m = A.NumCols();
20 for (long j = 0; j < m; j++) {
21 if (j > 0 && RandomBnd(2)) {
22 for (long i = 0; i < n; i++)
23 A[i][j] = A[i][j-1];
24 }
25 else {
26 for (long i = 0; i < n; i++)
27 random(A[i][j]);
28 }
29 }
30 }
31
32 void FillRandom(Vec<zz_p>& A)
33 {
34 long n = A.length();
35 for (long i = 0; i < n; i++)
36 random(A[i]);
37 }
38
39 long old_gauss(mat_zz_p& M, long w)
40 {
41 using NTL_NAMESPACE::negate;
42 long k, l;
43 long i, j;
44 long pos;
45 zz_p t1, t2, t3;
46 zz_p *x, *y;
47
48 long n = M.NumRows();
49 long m = M.NumCols();
50
51 if (w < 0 || w > m)
52 LogicError("gauss: bad args");
53
54 long p = zz_p::modulus();
55 mulmod_t pinv = zz_p::ModulusInverse();
56 long T1, T2;
57
58 l = 0;
59 for (k = 0; k < w && l < n; k++) {
60
61 pos = -1;
62 for (i = l; i < n; i++) {
63 if (!IsZero(M[i][k])) {
64 pos = i;
65 break;
66 }
67 }
68
69 if (pos != -1) {
70 swap(M[pos], M[l]);
71
72 inv(t3, M[l][k]);
73 negate(t3, t3);
74
75 for (i = l+1; i < n; i++) {
76 // M[i] = M[i] + M[l]*M[i,k]*t3
77
78 mul(t1, M[i][k], t3);
79
80 T1 = rep(t1);
81 mulmod_precon_t T1pinv = PrepMulModPrecon(T1, p, pinv);
82
83 clear(M[i][k]);
84
85 x = M[i].elts() + (k+1);
86 y = M[l].elts() + (k+1);
87
88 for (j = k+1; j < m; j++, x++, y++) {
89 // *x = *x + (*y)*t1
90
91 T2 = MulModPrecon(rep(*y), T1, p, T1pinv);
92 T2 = AddMod(T2, rep(*x), p);
93 (*x).LoopHole() = T2;
94 }
95 }
96
97 l++;
98 }
99 }
100
101 return l;
102 }
103
104 long old_gauss(mat_zz_p& M)
105 {
106 return old_gauss(M, M.NumCols());
107 }
108
109 void old_image(mat_zz_p& X, const mat_zz_p& A)
110 {
111 mat_zz_p M;
112 M = A;
113 long r = old_gauss(M);
114 M.SetDims(r, M.NumCols());
115 X = M;
116 }
117
118 int main(int argc, char **argv)
119 {
120 long iters = 100;
121
122
123 #if 1
124 cerr << "testing multiplication";
125 for (long cnt = 0; cnt < iters; cnt++) {
126 cerr << ".";
127
128 long bnd = (cnt%2) ? 25 : 2000;
129
130 long len = RandomBnd(NTL_SP_NBITS-3)+4;
131 long n = RandomBnd(bnd);
132 long l = RandomBnd(bnd);
133 long m = RandomBnd(bnd);
134
135 long p = RandomPrime_long(len);
136 zz_p::init(p);
137
138 Mat<zz_p> A, B, X;
139
140 A.SetDims(n, l);
141 B.SetDims(l, m);
142
143 FillRandom(A);
144 FillRandom(B);
145
146 X.SetDims(n, m);
147
148 vec_zz_p R;
149
150 R.SetLength(m);
151 for (long i = 0; i < m; i++) random(R[i]);
152
153 mul(X, A, B);
154
155 if (X*R != A*(B*R))
156 cerr << "*\n*\n*\n*\n*\n*********** oops " << len << " " << n << " " << l << " "
157 << m << "\n";
158 }
159 #endif
160
161 #if 1
162 cerr << "\ntesting inversion";
163 for (long cnt = 0; cnt < iters; cnt++) {
164 cerr << ".";
165 long bnd = (cnt%2) ? 25 : 1500;
166
167 long len = RandomBnd(NTL_SP_NBITS-3)+4;
168 long n = RandomBnd(bnd);
169
170 long p = RandomPrime_long(len);
171 zz_p::init(p);
172
173 Mat<zz_p> A, X;
174
175 A.SetDims(n, n);
176
177 FillRandom(A);
178
179
180 vec_zz_p R;
181
182 R.SetLength(n);
183 for (long i = 0; i < n; i++) random(R[i]);
184
185 zz_p d;
186
187 inv(d, X, A);
188
189 if (d != 0) {
190 if (R != A*(X*R))
191 cerr << "\n*\n*\n*\n*\n*********** oops " << len << " " << n << "\n";
192 }
193 else {
194 cerr << "[singular]";
195 }
196 }
197 #endif
198
199 #if 1
200 cerr << "\ntesting solve";
201 for (long cnt = 0; cnt < iters; cnt++) {
202 cerr << ".";
203 long bnd = (cnt%2) ? 25 : 2000;
204
205 long len = RandomBnd(NTL_SP_NBITS-3)+4;
206 long n = RandomBnd(bnd);
207
208 long p = RandomPrime_long(len);
209 zz_p::init(p);
210
211 Mat<zz_p> A;
212
213 A.SetDims(n, n);
214 FillRandom(A);
215
216 Vec<zz_p> x, b;
217 b.SetLength(n);
218 FillRandom(b);
219
220 zz_p d;
221
222 solve(d, A, x, b);
223
224 if (d != 0) {
225 if (A*x != b)
226 cerr << "\n*\n*\n*\n*\n*********** oops " << len << " " << n << "\n";
227 }
228 else {
229 cerr << "[singular]";
230 }
231 }
232 #endif
233
234 #if 1
235 cerr << "\ntesting image and kernel";
236 for (long cnt = 0; cnt < iters; cnt++) {
237 cerr << ".";
238 long bnd = (cnt%2) ? 25 : 1500;
239
240 long len = RandomBnd(NTL_SP_NBITS-3)+4;
241 long n = RandomBnd(bnd);
242 long m = RandomBnd(bnd);
243
244 long p = RandomPrime_long(len);
245 zz_p::init(p);
246
247 Mat<zz_p> A;
248
249 A.SetDims(n, m);
250 FillRandom1(A);
251
252 Mat<zz_p> im, im1, ker1;
253
254 old_image(im, A);
255 image(im1, A);
256 kernel(ker1, A);
257
258
259 if (im != im1 || !IsZero(ker1*A) || im1.NumRows() + ker1.NumRows() != n) {
260 cerr << "\n*\n*\n*\n*\n*********** oops " << len << " " << n << m << "\n";
261 }
262 }
263 #endif
264
265 cerr << "\n";
266
267 }
268
1212
1313 CXXFLAGS=@{CXXFLAGS}
1414 # Flags for the C++ compiler
15
16 CXXAUTOFLAGS=@{CXXAUTOFLAGS}
17 # Flags for the C++ compiler, automatically generated by configuration script
1518
1619
1720 AR=@{AR}
136139 O16=$(O15)
137140 O17=$(O16)
138141 O18=$(O17) xdouble.o
139 O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o
142 O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o BasicThreadPool.o
140143
141144 OBJ=$(O19)
142145
161164 S16=$(S15)
162165 S17=$(S16)
163166 S18=$(S17) xdouble.c
164 S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c
167 S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c BasicThreadPool.c
165168
166169 SRC = $(S19)
167170
193196 IN16=$(IN15) vec_vec_ZZ_p.h vec_vec_ZZ_pE.h vec_vec_long.h vec_vec_lzz_p.h
194197 IN17=$(IN16) vec_vec_lzz_pE.h vec_xdouble.h xdouble.h config.h version.h
195198 IN18=$(IN17) def_config.h new.h vec_ulong.h vec_vec_ulong.h c_lip.h g_lip.h
196 IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h
197 IN20=$(IN19) have_LL_no.h have_LL_yes.h have_builtin_clzl_no.h have_builtin_clzl_yes.h
198
199 INCL=$(IN20)
199 IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h BasicThreadPool.h
200 INCL=$(IN19)
200201
201202
202203
212213 # test source files
213214
214215 TS1=QuickTest.c BerlekampTest.c CanZassTest.c ZZXFacTest.c MoreFacTest.c LLLTest.c
215 TS2=$(TS1) subset.c MatrixTest.c CharPolyTest.c RRTest.c QuadTest.c
216 TS2=$(TS1) subset.c MatrixTest.c mat_lzz_pTest.c CharPolyTest.c RRTest.c QuadTest.c
216217 TS3=$(TS2) GF2XTest.c GF2EXTest.c BitMatTest.c ZZ_pEXTest.c lzz_pEXTest.c Timing.c
217218 TS4=$(TS3) ThreadTest.c ExceptionTest.c
218219 TS = $(TS4)
219220
220221 # scripts
221222
222 SCRIPTS1=MakeGetTime MakeGetPID MakeCheckCLZL MakeCheckLL TestScript dosify unixify RemoveProg
223 SCRIPTS1=MakeGetTime MakeGetPID MakeCheckFeature ResetFeatures CopyFeatures TestScript dosify unixify RemoveProg
223224 SCRIPTS2=$(SCRIPTS1) configure DoConfig mfile cfile ppscript
224225
225226 SCRIPTS=$(SCRIPTS2)
226227
227228 # auxiliary source
228229
229 MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c CheckPCLMUL.c
230 GT=GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
230 MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c
231 GT=GetTime0.c GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
231232 GP=GetPID1.c GetPID2.c TestGetPID.c
232 CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c
233 CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c CheckAVX.c CheckFMA.c CheckCompile.c
234
235 AUXPROGS = TestGetTime TestGetPID CheckFeature CheckCompile
233236
234237
235238
236239 # documentation
237240
238241
239 D01=copying.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
242 D01=copying.txt BasicThreadPool.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
240243 D02=$(D01) GF2XFactoring.txt GF2XVec.txt HNF.txt Lazy.txt LazyTable.txt LLL.txt RR.txt SmartPtr.txt
241244 D03=$(D02) ZZ.txt ZZVec.txt ZZX.txt ZZXFactoring.txt ZZ_p.txt ZZ_pE.txt
242245 D04=$(D03) ZZ_pEX.txt ZZ_pEXFactoring.txt ZZ_pX.txt ZZ_pXFactoring.txt
252255 D14=$(D13) tour-modules.html tour-unix.html tour-examples.html
253256 D15=$(D14) tour-roadmap.html tour-win.html tour-impl.html tour-struct.html
254257 D16=$(D15) tour.html tour-ex1.html tour-ex2.html tour-ex3.html tour-ex4.html
255 D17=$(D16) tour-ex5.html tour-ex6.html arrow1.gif arrow2.gif arrow3.gif
258 D17=$(D16) tour-ex5.html tour-ex6.html tour-ex7.html arrow1.gif arrow2.gif arrow3.gif
256259 D18=$(D17) tour-gmp.html tour-gf2x.html tour-tips.html config.txt version.txt
257260
258261 TX01=GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt GF2XFactoring.txt
263266 TX06=mat_ZZ_pE.txt mat_lzz_p.txt mat_lzz_pE.txt mat_poly_ZZ.txt mat_poly_ZZ_p.txt
264267 TX07=mat_poly_lzz_p.txt matrix.txt pair.txt quad_float.txt tools.txt vec_GF2.txt
265268 TX08=vec_GF2E.txt vec_RR.txt vec_ZZ.txt vec_ZZ_p.txt vec_ZZ_pE.txt vec_lzz_p.txt
266 TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt
269 TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt BasicThreadPool.txt
267270
268271 TXFILES=$(TX01) $(TX02) $(TX03) $(TX04) $(TX05) $(TX06) $(TX07) $(TX08) $(TX09)
269272
275278 HT06=mat_ZZ_pE.cpp.html mat_lzz_p.cpp.html mat_lzz_pE.cpp.html mat_poly_ZZ.cpp.html mat_poly_ZZ_p.cpp.html
276279 HT07=mat_poly_lzz_p.cpp.html matrix.cpp.html pair.cpp.html quad_float.cpp.html tools.cpp.html vec_GF2.cpp.html
277280 HT08=vec_GF2E.cpp.html vec_RR.cpp.html vec_ZZ.cpp.html vec_ZZ_p.cpp.html vec_ZZ_pE.cpp.html vec_lzz_p.cpp.html
278 HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html
281 HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html BasicThreadPool.cpp.html
279282
280283 HTFILES=$(HT01) $(HT02) $(HT03) $(HT04) $(HT05) $(HT06) $(HT07) $(HT08) $(HT09)
281284
287290 # test program executables
288291
289292 PROG1=QuickTest BerlekampTest CanZassTest ZZXFacTest MoreFacTest LLLTest BitMatTest
290 PROG2=$(PROG1) MatrixTest CharPolyTest RRTest QuadTest
293 PROG2=$(PROG1) MatrixTest mat_lzz_pTest CharPolyTest RRTest QuadTest
291294 PROG3=$(PROG2) GF2XTest GF2EXTest subset ZZ_pEXTest lzz_pEXTest Timing ThreadTest
292295 PROGS = $(PROG3)
293296
294297 # things to save to a tar file
295298
296299 SFI1=makefile $(SRC) $(SINC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) mach_desc.win
297 SFI2=$(SFI1) MulTimeTest.c PolyTimeTest.c Poly1TimeTest.c GF2XTimeTest.c
300 SFI2=$(SFI1) MulTimeTest.c Poly1TimeTest.c Poly2TimeTest.c Poly3TimeTest.c GF2XTimeTest.c
298301 SFI3=$(SFI2) InitSettings.c DispSettings.c WizardAux Wizard def_makefile
299302 SFILES=$(SFI3)
300303
309312 NTL_INCLUDE = -I../include -I.
310313 # NTL needs this to find its include files
311314
312 COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) -c
313
314 LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS)
315 COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) -c
316
317 LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) $(LDFLAGS)
315318
316319
317320
341344 # setup2 does some dynamic checks for GetTime, GetPID, __builtin_clzl, LL types, AVX, and FMA
342345
343346 setup2:
347 echo "*** CheckFeature log ***" > CheckFeature.log
344348 sh MakeGetTime "$(LINK)" "$(LDLIBS)"
345349 sh MakeGetPID "$(LINK)" "$(LDLIBS)"
346 sh MakeCheckCLZL "$(LINK)" "$(LDLIBS)"
347 sh MakeCheckLL "$(LINK)" "$(LDLIBS)"
350 sh MakeCheckFeature BUILTIN_CLZL "CheckCLZL.c CheckCLZLAux.c" "$(LINK)" "$(LDLIBS)"
351 sh MakeCheckFeature LL_TYPE "CheckLL.c CheckLLAux.c" "$(LINK)" "$(LDLIBS)"
352 sh MakeCheckFeature AVX "CheckAVX.c" "$(LINK)" "$(LDLIBS)"
353 sh MakeCheckFeature FMA "CheckFMA.c" "$(LINK)" "$(LDLIBS)"
348354
349355 # setup3 generates the file ../include/NTL/gmp_aux.h
350356 # The file ../include/NTL/gmp_aux.h is included in ../include/NTL/lip.h
382388 GetPID.o: GetPID.c
383389 $(LCOMP) $(COMPILE) GetPID.c
384390
385 CheckPCLMUL: CheckPCLMUL.c
386 $(LINK) -o CheckPCLMUL CheckPCLMUL.c $(LDLIBS)
391 CheckCompile: CheckCompile.c
392 $(LINK) -o CheckCompile CheckCompile.c $(LDLIBS)
393
387394
388395 .c.o:
389396 $(LCOMP) $(COMPILE) $(GF2X_OPT_INCDIR) $<
460467
461468 clobber:
462469 rm -f ntl.a mach_desc.h ../include/NTL/mach_desc.h GetTime.c GetPID.c
463 cp ../include/NTL/have_LL_no.h ../include/NTL/have_LL.h
464 cp ../include/NTL/have_builtin_clzl_no.h ../include/NTL/have_builtin_clzl.h
470 sh ResetFeatures '..'
465471 rm -f ../include/NTL/gmp_aux.h
466 sh RemoveProg $(PROGS) MakeDesc TestGetTime TestGetPID gen_gmp_aux
472 sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
467473 rm -f *.o
468474 rm -rf small
469475 rm -f cfileout mfileout
471477 rm -f all
472478
473479 clean:
474 sh RemoveProg MakeDesc TestGetTime TestGetPID gen_gmp_aux
480 sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
475481 rm -f *.o
476482 rm -rf small
477483 @{LSHAR} - $(LIBTOOL) --mode=clean rm -f libntl.la *.lo #LSHAR
497503
498504
499505 package:
506 ./configure --nowrite
507 cp mfileout def_makefile
508 cp cfileout ../include/NTL/def_config.h
500509 sh unixify "$(SFILES) DIRNAME WINDIR VERSION_INFO NOTES" "$(INCL)" "$(DOC)"
501510 rm -rf `cat DIRNAME`
502511 rm -f `cat DIRNAME`.tar
508517 rm -rf `cat DIRNAME`
509518
510519 winpack:
520 ./configure --nowrite NTL_GMP_LIP=off
521 cp mfileout def_makefile
522 cp cfileout ../include/NTL/def_config.h
511523 sh dosify "$(SRC)" "$(INCL)" "$(DOC)" "$(TS)" "$(TD)" "$(SINC)"
512524 rm -rf `cat WINDIR`
513525 rm -f `cat WINDIR`.zip
526538
527539 WO1 = FFT.o GetTime.o GetPID.o ctools.o ZZ.o ZZVec.o ZZ_p.o ZZ_pX.o
528540 WO2 = $(WO1) ZZ_pX1.o lip.o tools.o vec_ZZ.o vec_ZZ_p.o
529 WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o fileio.o
541 WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o BasicThreadPool.o fileio.o
530542
531543 WOBJ = $(WO3)
532544
538550 MulTimeTest:
539551 $(LINK) -o MulTimeTest MulTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
540552
541 PolyTimeTest:
542 $(LINK) -o PolyTimeTest PolyTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
543553
544554 Poly1TimeTest:
545555 $(LINK) -o Poly1TimeTest Poly1TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
556 Poly2TimeTest:
557 $(LINK) -o Poly2TimeTest Poly2TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
558 Poly3TimeTest:
559 $(LINK) -o Poly3TimeTest Poly3TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
546560
547561
548562 GF2XTimeTest:
77 do
88 name=`basename $i .txt`
99 cp $name.txt $name.cpp
10 $VIM $name.cpp '+set nu!' '+TOhtml' '+:1,$s/.*@anchor{\(.*\)}.*/<a name="\1"><\/a>/' '+w' '+qa!'
10 $VIM $name.cpp '+set nu!' '+let c_no_curly_error=1' '+syntax off' '+syntax on' '+TOhtml' '+:1,$s/.*@anchor{\(.*\)}.*/<a name="\1"><\/a>/' '+w' '+qa!'
1111 done
1212
1313
4343 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
4444 */
4545
46
47
48 #ifdef __INTEL_COMPILER
49 #pragma float_control(precise,on)
50 #endif
51
52 // NOTE: the above will force the Intel compiler to adhere to
53 // language standards, which it does not do by default
54
4655 #include <NTL/quad_float.h>
4756 #include <NTL/RR.h>
4857
112121
113122
114123 #if (NTL_BITS_PER_LONG >= NTL_DOUBLE_PRECISION)
124
125
115126 quad_float to_quad_float(long n)
116127 {
117 START_FIX
118128 DOUBLE xhi, xlo;
119 DOUBLE u, v;
120
121 xhi = double(n);
129
130 xhi = TrueDouble(n);
122131
123132 // Because we are assuming 2's complement integer
124133 // arithmetic, the following prevents long(xhi) from overflowing.
125134
126135 if (n > 0)
127 xlo = double(n+long(-xhi));
136 xlo = TrueDouble(n+long(-xhi));
128137 else
129 xlo = double(n-long(xhi));
138 xlo = TrueDouble(n-long(xhi));
130139
131140 // renormalize...just to be safe
132
133 u = xhi + xlo;
134 v = xhi - u;
135 v = v + xlo;
136 END_FIX
137 return quad_float(u, v);
141
142 quad_float z;
143 normalize(z, xhi, xlo);
144 return z;
138145 }
139146
140147 quad_float to_quad_float(unsigned long n)
141148 {
142 START_FIX
143149 DOUBLE xhi, xlo, t;
144 DOUBLE u, v;
145150
146151 const double bnd = double(1L << (NTL_BITS_PER_LONG-2))*4.0;
147152
148 xhi = double(n);
153 xhi = TrueDouble(n);
149154
150155 if (xhi >= bnd)
151156 t = xhi - bnd;
154159
155160 // we use the "to_long" function here to be as portable as possible.
156161 long llo = to_long(n - (unsigned long)(t));
157 xlo = double(llo);
158
159 // renormalize...just to be safe
160
161 u = xhi + xlo;
162 v = xhi - u;
163 v = v + xlo;
164 END_FIX
165 return quad_float(u, v);
162 xlo = TrueDouble(llo);
163
164 quad_float z;
165 normalize(z, xhi, xlo);
166 return z;
166167 }
167168 #endif
168169
169170
170 NTL_THREAD_LOCAL
171 NTL_CHEAP_THREAD_LOCAL
171172 long quad_float::oprec = 10;
172173
173174 void quad_float::SetOutputPrecision(long p)
351352 }
352353
353354
355
356 #if (NTL_FMA_DETECTED)
357
358 double quad_float_zero = 0;
359
360 static inline
361 double Protect(double x) { return x + quad_float_zero; }
362
363 #else
364
365
366 static inline
367 double Protect(double x) { return x; }
368
369
370 #endif
371
372 // NOTE: this is really sick: some compilers will issue FMA
373 // (fused mul add) instructions which will break correctness.
374 // C99 standard is supposed to prevent this across separate
375 // statements, but C++ standard doesn't guarantee much at all.
376 // In any case, gcc does not even implement the C99 standard
377 // correctly. One could disable this by compiling with
378 // an appropriate flag: -mno-fma works for gcc, while -no-fma works
379 // for icc. icc and MSVC++ also support pragmas to do this:
380 // #pragma fp_contract(off). There is also a compiler flag for
381 // gcc: -ffp-contract=off, but -mno-fma seems more widely supported.
382 // These flags work for clang, as well.
383 //
384 // But in any case, I'd rather not mess with getting these flags right.
385 // Calling Protect(a*b) has the effect of forcing the
386 // compiler to compute a*b + 0. Assuming the compiler otherwise
387 // does not perform any re-association, this should do the trick.
388 // There is a small performance penalty, but it should be reasonable.
389
390
391
354392 quad_float operator *(const quad_float& x,const quad_float& y ) {
355393 START_FIX
356394 DOUBLE hx, tx, hy, ty, C, c;
357395 DOUBLE t1, t2;
358396
359 C = NTL_QUAD_FLOAT_SPLIT*x.hi;
397 C = Protect(NTL_QUAD_FLOAT_SPLIT*x.hi);
360398 hx = C-x.hi;
361 c = NTL_QUAD_FLOAT_SPLIT*y.hi;
399 c = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
362400 hx = C-hx;
363401 tx = x.hi-hx;
364402 hy = c-y.hi;
365 C = x.hi*y.hi;
403 C = Protect(x.hi*y.hi);
366404 hy = c-hy;
367405 ty = y.hi-hy;
368406
369407 // c = ((((hx*hy-C)+hx*ty)+tx*hy)+tx*ty)+(x.hi*y.lo+x.lo*y.hi);
370408
371 t1 = hx*hy;
409 t1 = Protect(hx*hy);
372410 t1 = t1-C;
373 t2 = hx*ty;
411 t2 = Protect(hx*ty);
374412 t1 = t1+t2;
375 t2 = tx*hy;
413 t2 = Protect(tx*hy);
376414 t1 = t1+t2;
377 t2 = tx*ty;
415 t2 = Protect(tx*ty);
378416 c = t1+t2;
379 t1 = x.hi*y.lo;
380 t2 = x.lo*y.hi;
417 t1 = Protect(x.hi*y.lo);
418 t2 = Protect(x.lo*y.hi);
381419 t1 = t1+t2;
382420 c = c + t1;
383421
395433 DOUBLE hx, tx, hy, ty, C, c;
396434 DOUBLE t1, t2;
397435
398 C = NTL_QUAD_FLOAT_SPLIT*x.hi;
436 C = Protect(NTL_QUAD_FLOAT_SPLIT*x.hi);
399437 hx = C-x.hi;
400 c = NTL_QUAD_FLOAT_SPLIT*y.hi;
438 c = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
401439 hx = C-hx;
402440 tx = x.hi-hx;
403441 hy = c-y.hi;
404 C = x.hi*y.hi;
442 C = Protect(x.hi*y.hi);
405443 hy = c-hy;
406444 ty = y.hi-hy;
407445
408446 // c = ((((hx*hy-C)+hx*ty)+tx*hy)+tx*ty)+(x.hi*y.lo+x.lo*y.hi);
409447
410 t1 = hx*hy;
448 t1 = Protect(hx*hy);
411449 t1 = t1-C;
412 t2 = hx*ty;
450 t2 = Protect(hx*ty);
413451 t1 = t1+t2;
414 t2 = tx*hy;
452 t2 = Protect(tx*hy);
415453 t1 = t1+t2;
416 t2 = tx*ty;
454 t2 = Protect(tx*ty);
417455 c = t1+t2;
418 t1 = x.hi*y.lo;
419 t2 = x.lo*y.hi;
456 t1 = Protect(x.hi*y.lo);
457 t2 = Protect(x.lo*y.hi);
420458 t1 = t1+t2;
421459 c = c + t1;
422460
437475 DOUBLE t1;
438476
439477 C = x.hi/y.hi;
440 c = NTL_QUAD_FLOAT_SPLIT*C;
478 c = Protect(NTL_QUAD_FLOAT_SPLIT*C);
441479 hc = c-C;
442 u = NTL_QUAD_FLOAT_SPLIT*y.hi;
480 u = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
443481 hc = c-hc;
444482 tc = C-hc;
445483 hy = u-y.hi;
446 U = C * y.hi;
484 U = Protect(C * y.hi);
447485 hy = u-hy;
448486 ty = y.hi-hy;
449487
450488 // u = (((hc*hy-U)+hc*ty)+tc*hy)+tc*ty;
451489
452 u = hc*hy;
490 u = Protect(hc*hy);
453491 u = u-U;
454 t1 = hc*ty;
492 t1 = Protect(hc*ty);
455493 u = u+t1;
456 t1 = tc*hy;
494 t1 = Protect(tc*hy);
457495 u = u+t1;
458 t1 = tc*ty;
496 t1 = Protect(tc*ty);
459497 u = u+t1;
460498
461499 // c = ((((x.hi-U)-u)+x.lo)-C*y.lo)/y.hi;
463501 c = x.hi-U;
464502 c = c-u;
465503 c = c+x.lo;
466 t1 = C*y.lo;
504 t1 = Protect(C*y.lo);
467505 c = c - t1;
468506 c = c/y.hi;
469507
481519 DOUBLE t1;
482520
483521 C = x.hi/y.hi;
484 c = NTL_QUAD_FLOAT_SPLIT*C;
522 c = Protect(NTL_QUAD_FLOAT_SPLIT*C);
485523 hc = c-C;
486 u = NTL_QUAD_FLOAT_SPLIT*y.hi;
524 u = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
487525 hc = c-hc;
488526 tc = C-hc;
489527 hy = u-y.hi;
490 U = C * y.hi;
528 U = Protect(C * y.hi);
491529 hy = u-hy;
492530 ty = y.hi-hy;
493531
494532 // u = (((hc*hy-U)+hc*ty)+tc*hy)+tc*ty;
495533
496 u = hc*hy;
534 u = Protect(hc*hy);
497535 u = u-U;
498 t1 = hc*ty;
536 t1 = Protect(hc*ty);
499537 u = u+t1;
500 t1 = tc*hy;
538 t1 = Protect(tc*hy);
501539 u = u+t1;
502 t1 = tc*ty;
540 t1 = Protect(tc*ty);
503541 u = u+t1;
504542
505543 // c = ((((x.hi-U)-u)+x.lo)-C*y.lo)/y.hi;
507545 c = x.hi-U;
508546 c = c-u;
509547 c = c+x.lo;
510 t1 = C*y.lo;
548 t1 = Protect(C*y.lo);
511549 c = c - t1;
512550 c = c/y.hi;
513551
536574 DOUBLE p,q,hx,tx,u,uu,cc;
537575 DOUBLE t1;
538576
539 p = NTL_QUAD_FLOAT_SPLIT*c;
577 p = Protect(NTL_QUAD_FLOAT_SPLIT*c);
540578 hx = (c-p);
541579 hx = hx+p;
542580 tx = c-hx;
543 p = hx*hx;
544 q = hx*tx;
581 p = Protect(hx*hx);
582 q = Protect(hx*tx);
545583 q = q+q;
546584
547585 u = p+q;
548586 uu = p-u;
549587 uu = uu+q;
550 t1 = tx*tx;
588 t1 = Protect(tx*tx);
551589 uu = uu+t1;
552590
553591
694732 RR::SetPrecision(long(3.33*quad_float::oprec) + 10);
695733 RR::SetOutputPrecision(quad_float::oprec);
696734
697 NTL_THREAD_LOCAL static RR t;
735 NTL_TLS_LOCAL(RR, t);
698736
699737 conv(t, a);
700738 s << t;
707745 RRPush push;
708746 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
709747
710 NTL_THREAD_LOCAL static RR t;
748 NTL_TLS_LOCAL(RR, t);
711749 NTL_INPUT_CHECK_RET(s, s >> t);
712750 conv(x, t);
713751
719757 RRPush push;
720758 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
721759
722 NTL_THREAD_LOCAL static RR t;
760 NTL_TLS_LOCAL(RR, t);
723761 random(t);
724762 conv(x, t);
725763 }
812850 RRPush push;
813851 RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
814852
815 NTL_THREAD_LOCAL static RR t;
853 NTL_TLS_LOCAL(RR, t);
816854 conv(t, s);
817855 conv(x, t);
818856
855893 ResourceError("exp(quad_float): overflow");
856894 }
857895
858 // changed this from "const" to "static" in v5.3, since "const"
859 // causes the initialization to be performed with *every* invocation.
860 NTL_THREAD_LOCAL static quad_float Log2 =
861 to_quad_float("0.6931471805599453094172321214581765680755");
896 static const quad_float Log2 = to_quad_float("0.6931471805599453094172321214581765680755");
897 // GLOBAL (assumes C++11 thread-safe init)
862898
863899 quad_float y,temp,ysq,sum1,sum2;
864900 long iy;
911947
912948
913949 NTL_END_IMPL
950
1414
1515 const string& CurrentThreadID()
1616 {
17 NTL_THREAD_LOCAL static string ID;
18 NTL_THREAD_LOCAL static bool initialized = false;
17 NTL_TLS_LOCAL(string, ID);
18 static NTL_CHEAP_THREAD_LOCAL bool initialized = false;
1919
2020 if (!initialized) {
2121 #ifdef NTL_THREADS
99
1010 NTL_START_IMPL
1111
12 NTL_THREAD_LOCAL void (*ErrorCallback)() = 0;
13 NTL_THREAD_LOCAL void (*ErrorMsgCallback)(const char *) = 0;
12 NTL_CHEAP_THREAD_LOCAL void (*ErrorCallback)() = 0;
13 NTL_CHEAP_THREAD_LOCAL void (*ErrorMsgCallback)(const char *) = 0;
1414
1515
1616 void TerminalError(const char *s)
2525 done
2626
2727 cp ../include/NTL/def_config.h unix/include/NTL/config.h
28 cp ../include/NTL/have_LL_no.h unix/include/NTL/have_LL.h
29 cp ../include/NTL/have_builtin_clzl_no.h unix/include/NTL/have_builtin_clzl.h
30
3128 cp def_makefile unix/src/makefile
29 sh ResetFeatures unix
3232
3333 void mul(vec_RR& x, const vec_RR& a, double b_in)
3434 {
35 NTL_THREAD_LOCAL static RR b;
35 NTL_TLS_LOCAL(RR, b);
3636 conv(b, b_in);
3737 long n = a.length();
3838 x.SetLength(n);
1717 long p = zz_p::modulus();
1818
1919 for (i = 0; i < n; i++)
20 xp[i].LoopHole() = rem(a[i], p);
20 xp[i].LoopHole() = rem(ap[i], p);
21 }
22 //
23 // NOTE: the signature for this is in lzz_p.h
24 void conv(vec_zz_p& x, const Vec<long>& a)
25 {
26 long i, n;
27
28 n = a.length();
29 x.SetLength(n);
30
31 zz_p* xp = x.elts();
32 const long* ap = a.elts();
33
34 long p = zz_p::modulus();
35 sp_reduce_struct red_struct = zz_p::red_struct();
36
37 for (i = 0; i < n; i++)
38 xp[i].LoopHole() = rem(ap[i], p, red_struct);
2139 }
2240
2341
88
99
1010
11 NTL_THREAD_LOCAL
11 NTL_CHEAP_THREAD_LOCAL
1212 long xdouble::oprec = 10;
1313
1414 void xdouble::SetOutputPrecision(long p)
270270 RRPush push;
271271 RR::SetPrecision(NTL_DOUBLE_PRECISION);
272272
273 NTL_THREAD_LOCAL static RR t;
273 NTL_TLS_LOCAL(RR, t);
274274 conv(t, a);
275275
276276 double x;
293293 RRPush push;
294294 RR::SetPrecision(NTL_DOUBLE_PRECISION);
295295
296 NTL_THREAD_LOCAL static RR t;
296 NTL_TLS_LOCAL(RR, t);
297297 conv(t, b);
298298 conv(x, t);
299299 }
512512
513513 double log(const xdouble& a)
514514 {
515 NTL_THREAD_LOCAL static double LogBound = log(NTL_XD_BOUND);
515 static const double LogBound = log(NTL_XD_BOUND); // GLOBAL (assumes C++11 thread-safe init)
516516 if (a.x <= 0) {
517517 ArithmeticError("log(xdouble): argument must be positive");
518518 }
565565
566566 xdouble PowerOf10(const ZZ& e)
567567 {
568 NTL_THREAD_LOCAL static long init = 0;
569 NTL_THREAD_LOCAL static xdouble v10k;
570 NTL_THREAD_LOCAL static long k;
568 static NTL_CHEAP_THREAD_LOCAL long init = 0;
569 static NTL_CHEAP_THREAD_LOCAL long k = 0;
570
571 NTL_TLS_LOCAL(xdouble, v10k);
571572
572573 if (!init) {
573574 k = ComputeMax10Power();