Commit 55c369b715ebaf38a7de1bdcd37041e97dde3bd6 - ntl

Import ntl_9.9.0.orig.tar.gz Julien Puydt 7 years ago

238 changed file(s) with 18542 addition(s) and 3786 deletion(s). Raw diff Collapse all Expand all

-2

README less more

0		NTL -- a library for doing numbery theory -- version 9.3.0
1		Release date: 2015.7.9
	0	NTL -- a library for doing number theory -- version 9.9.0
	1	Release date: 2016.05.30
2	2
3	3	Author: Victor Shoup (victor@shoup.net)
4	4

+384

-0

doc/BasicThreadPool.cpp.html less more

	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
	1	<html>
	2	<head>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/BasicThreadPool.cpp.html</title>
	4	<meta name="Generator" content="Vim/7.1">
	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
	6	</head>
	7	<body bgcolor="#ffffff" text="#000000"><font face="monospace">
	8	<br>
	9	<br>
	10	<font color="#0000ed"><i>/</i></font><font color="#0000ed"><i>**********************************************************************</i></font><br>
	11	<br>
	12	<font color="#0000ed"><i>MODULE: BasicThreadPool</i></font><br>
	13	<br>
	14	<font color="#0000ed"><i>SUMMARY:</i></font><br>
	15	<br>
	16	<font color="#0000ed"><i>A simple thread pool class BasicThreadPool, as well as some higher-level macros</i></font><br>
	17	<font color="#0000ed"><i>which facilitate simple parallel for loops.</i></font><br>
	18	<br>
	19	<br>
	20	<font color="#0000ed"><i>*************************************************************************</i></font><font color="#0000ed"><i>/</i></font><br>
	21	<br>
	22	<br>
	23	<font color="#0000ed"><i>// ******************** Simple parallel for loops ************************</i></font><br>
	24	<font color="#0000ed"><i>// </i></font><br>
	25	<font color="#0000ed"><i>// We begin with a description of the higher-level macros for writing simple</i></font><br>
	26	<font color="#0000ed"><i>// parallel for loops.  These facilities are activated only when NTL is</i></font><br>
	27	<font color="#0000ed"><i>// configured with NTL_THREAD_BOOST=on (which implies NTL_THREADS=on).</i></font><br>
	28	<font color="#0000ed"><i>// However, code that uses these facilities should still compile and run</i></font><br>
	29	<font color="#0000ed"><i>// correctly even when NTL_THREAD_BOOST=off, or even when NTL_THREADS=off, so</i></font><br>
	30	<font color="#0000ed"><i>// this is the simplest way to write parallel for loops across a range of</i></font><br>
	31	<font color="#0000ed"><i>// compile-time and run-time environments.  Note that if NTL_THREADS=on, C++11</i></font><br>
	32	<font color="#0000ed"><i>// features are required, but when NTL_THREADS=off, these features are not</i></font><br>
	33	<font color="#0000ed"><i>// required, so the code should compile on older C++ compilers.</i></font><br>
	34	<font color="#0000ed"><i>// </i></font><br>
	35	<font color="#0000ed"><i>// Here is a simple recipe for writing a parallel for loop.</i></font><br>
	36	<font color="#0000ed"><i>// </i></font><br>
	37	<font color="#0000ed"><i>// At the start of program execution, your program should execute</i></font><br>
	38	<br>
	39	SetNumThreads(nt);<br>
	40	<br>
	41	<font color="#0000ed"><i>// You can choose nt to be any positive integer, but for best results, it</i></font><br>
	42	<font color="#0000ed"><i>// should correspond to the number of available cores on your machine.</i></font><br>
	43	<font color="#0000ed"><i>// [NOTE: if NTL_THREAD_BOOST=off, this function is still defined, but does</i></font><br>
	44	<font color="#0000ed"><i>// nothing.]</i></font><br>
	45	<font color="#0000ed"><i>// </i></font><br>
	46	<font color="#0000ed"><i>// Now consider the following routine:</i></font><br>
	47	<br>
	48	<font color="#008b00"><b>void</b></font> mul(ZZ *x, <font color="#008b00"><b>const</b></font> ZZ *a, <font color="#008b00"><b>const</b></font> ZZ *b, <font color="#008b00"><b>long</b></font> n) <br>
	49	{<br>
	50	<font color="#b02f60"><b>for</b></font> (<font color="#008b00"><b>long</b></font> i = <font color="#ff8b00">0</font>; i < n; i++)<br>
	51	mul(x[i], a[i], b[i]);<br>
	52	}<br>
	53	<br>
	54	<font color="#0000ed"><i>// We can parallelize it as follows:</i></font><br>
	55	<br>
	56	<font color="#008b00"><b>void</b></font> mul(ZZ *x, <font color="#008b00"><b>const</b></font> ZZ *a, <font color="#008b00"><b>const</b></font> ZZ *b, <font color="#008b00"><b>long</b></font> n) <br>
	57	{<br>
	58	NTL_EXEC_RANGE(n, first, last) <br>
	59	<br>
	60	<font color="#b02f60"><b>for</b></font> (<font color="#008b00"><b>long</b></font> i = first; i < last; i++)<br>
	61	mul(x[i], a[i], b[i]);<br>
	62	<br>
	63	NTL_EXEC_RANGE_END<br>
	64	}<br>
	65	<br>
	66	<font color="#0000ed"><i>// NTL_EXEC_RANGE and NTL_EXEC_RANGE_END are macros that just "do the right</i></font><br>
	67	<font color="#0000ed"><i>// thing".  If there are nt threads available, the interval [0..n) will be</i></font><br>
	68	<font color="#0000ed"><i>// partitioned into (up to)  nt subintervals, and a different thread will be</i></font><br>
	69	<font color="#0000ed"><i>// used to process each subinterval. You still have to write the for loop</i></font><br>
	70	<font color="#0000ed"><i>// yourself: the macro just declares and initializes variables "first" and</i></font><br>
	71	<font color="#0000ed"><i>// "last" (or whatever you want to call them) of type long that represent the</i></font><br>
	72	<font color="#0000ed"><i>// subinterval [first..last) to be processed by one thread.</i></font><br>
	73	<font color="#0000ed"><i>// </i></font><br>
	74	<font color="#0000ed"><i>// Note that the current thread participates as one of the nt available</i></font><br>
	75	<font color="#0000ed"><i>// threads, and that the current thread will wait for all participating threads</i></font><br>
	76	<font color="#0000ed"><i>// to finish their task before proceeding.</i></font><br>
	77	<font color="#0000ed"><i>// </i></font><br>
	78	<font color="#0000ed"><i>// Within the "body" of this construct, you can freely reference any variables</i></font><br>
	79	<font color="#0000ed"><i>// that are visible at this point.  This is implemented using the C++ lambda</i></font><br>
	80	<font color="#0000ed"><i>// feature (capturing all variables by reference).</i></font><br>
	81	<font color="#0000ed"><i>// </i></font><br>
	82	<font color="#0000ed"><i>// This construct will still work even if threads are disabled, in which case</i></font><br>
	83	<font color="#0000ed"><i>// it runs single-threaded with first=0 and last=n.</i></font><br>
	84	<font color="#0000ed"><i>// </i></font><br>
	85	<font color="#0000ed"><i>// Note that the code within the EXEC_RANGE body could call other routines that</i></font><br>
	86	<font color="#0000ed"><i>// themselves attempt to execute an EXEC_RANGE: if this happens, the latter</i></font><br>
	87	<font color="#0000ed"><i>// EXEC_RANGE will detect this and run single-threaded.</i></font><br>
	88	<font color="#0000ed"><i>// </i></font><br>
	89	<font color="#0000ed"><i>// You may wish to do other things within the EXEC_RANGE body than just execute</i></font><br>
	90	<font color="#0000ed"><i>// a loop.  One thing you may want to do is to declare variables.  Another</i></font><br>
	91	<font color="#0000ed"><i>// thing you may want to do is setup a local context for a ZZ_p modulus (or</i></font><br>
	92	<font color="#0000ed"><i>// other type of modulus).  Here is an example of doing this:</i></font><br>
	93	<br>
	94	<br>
	95	<font color="#008b00"><b>void</b></font> mul(ZZ_p *x, <font color="#008b00"><b>const</b></font> ZZ_p *a, <font color="#008b00"><b>const</b></font> ZZ_p *b, <font color="#008b00"><b>long</b></font> n) <br>
	96	{<br>
	97	ZZ_pContext context;<br>
	98	context.save();<br>
	99	<br>
	100	NTL_EXEC_RANGE(n, first, last) <br>
	101	<br>
	102	context.restore();<br>
	103	<br>
	104	<font color="#b02f60"><b>for</b></font> (<font color="#008b00"><b>long</b></font> i = first; i < last; i++)<br>
	105	mul(x[i], a[i], b[i]);<br>
	106	<br>
	107	NTL_EXEC_RANGE_END<br>
	108	}<br>
	109	<br>
	110	<br>
	111	<font color="#0000ed"><i>// Another useful function is AvailableThreads(), which will return the number</i></font><br>
	112	<font color="#0000ed"><i>// of available threads.  If threads or thread boosting is not enabled, this</i></font><br>
	113	<font color="#0000ed"><i>// will return 1.  Even if thread boosting is enabled, this may return 1 if for</i></font><br>
	114	<font color="#0000ed"><i>// whatever reason, the thread pool is not available for use (for example,</i></font><br>
	115	<font color="#0000ed"><i>// SetNumThreads was never called, or the thread pool is already active).</i></font><br>
	116	<font color="#0000ed"><i>// </i></font><br>
	117	<font color="#0000ed"><i>// A lower-level set of tools is available, which allows you to simply run a</i></font><br>
	118	<font color="#0000ed"><i>// specified number of threads.  Assuming nt <= AvailableThreads(), the code</i></font><br>
	119	<br>
	120	NTL_EXEC_INDEX(nt, index)<br>
	121	<br>
	122	... code ...<br>
	123	<br>
	124	NTL_EXEC_INDEX_END<br>
	125	<br>
	126	<font color="#0000ed"><i>// will execute the body on nt different threads, each with a unique index in</i></font><br>
	127	<font color="#0000ed"><i>// the range [0..nt).  A variable named "index" (or whatever name you specify)</i></font><br>
	128	<font color="#0000ed"><i>// of type long will hold the given index.</i></font><br>
	129	<font color="#0000ed"><i>// </i></font><br>
	130	<font color="#0000ed"><i>// This tool is useful if you need to manage memory a bit more carefully.  For</i></font><br>
	131	<font color="#0000ed"><i>// example, the following code will compute an inner product using all</i></font><br>
	132	<font color="#0000ed"><i>// available threads:</i></font><br>
	133	<br>
	134	ZZ InnerProd(<font color="#008b00"><b>const</b></font> ZZ *a, <font color="#008b00"><b>const</b></font> ZZ *b, <font color="#008b00"><b>long</b></font> n) <br>
	135	{<br>
	136	PartitionInfo pinfo(n);<br>
	137	<br>
	138	<font color="#008b00"><b>long</b></font> cnt = pinfo.NumIntervals();<br>
	139	<br>
	140	Vec<ZZ> acc;<br>
	141	acc.SetLength(cnt);<br>
	142	<br>
	143	NTL_EXEC_INDEX(cnt, index)<br>
	144	<br>
	145	<font color="#008b00"><b>long</b></font> first, last;<br>
	146	pinfo.interval(first, last, index);<br>
	147	<br>
	148	ZZ& sum = acc[index];<br>
	149	sum = <font color="#ff8b00">0</font>;<br>
	150	<font color="#b02f60"><b>for</b></font> (<font color="#008b00"><b>long</b></font> i = first; i < last; i++) <br>
	151	MulAddTo(sum, a[i], b[i]);<br>
	152	<br>
	153	NTL_EXEC_INDEX_END<br>
	154	<br>
	155	ZZ sum;<br>
	156	sum = <font color="#ff8b00">0</font>;<br>
	157	<font color="#b02f60"><b>for</b></font> (<font color="#008b00"><b>long</b></font> i = <font color="#ff8b00">0</font>; i < cnt; i++)<br>
	158	sum += acc[i];<br>
	159	<br>
	160	<font color="#b02f60"><b>return</b></font> sum;<br>
	161	}<br>
	162	<br>
	163	<font color="#0000ed"><i>// This example also illustrates the class PartitionInfo, which is useful for</i></font><br>
	164	<font color="#0000ed"><i>// partitioning a large interval into smaller intervals (it is used internally</i></font><br>
	165	<font color="#0000ed"><i>// by EXEC_RANGE).  The constructor takes a single argument (in this example n)</i></font><br>
	166	<font color="#0000ed"><i>// and computes a partition of [0..n) into nearly equally sized subintervals.</i></font><br>
	167	<font color="#0000ed"><i>// The method NumIntervals() returns the number of subintervals, and the method</i></font><br>
	168	<font color="#0000ed"><i>// interval(first, last, index) sets first and last according to the endpoints</i></font><br>
	169	<font color="#0000ed"><i>// of the subinterval [first..last) with the given index.</i></font><br>
	170	<font color="#0000ed"><i>// </i></font><br>
	171	<font color="#0000ed"><i>// So in this example, cnt threads will run, each accumulating a sum into a</i></font><br>
	172	<font color="#0000ed"><i>// corresponding element of the vector acc, and afterwards, these elements are</i></font><br>
	173	<font color="#0000ed"><i>// summed.</i></font><br>
	174	<font color="#0000ed"><i>// </i></font><br>
	175	<font color="#0000ed"><i>// Note that if threads are not enabled or otherwise unavailable, the above</i></font><br>
	176	<font color="#0000ed"><i>// code will compile and run correctly (just using one thread).</i></font><br>
	177	<font color="#0000ed"><i>// </i></font><br>
	178	<font color="#0000ed"><i>// Finally, there is a "guarded" version of NTL_EXEC_RANGE called</i></font><br>
	179	<font color="#0000ed"><i>// NTL_GEXEC_RANGE.  This allows one to dynamically "guard" against parallel</i></font><br>
	180	<font color="#0000ed"><i>// execution. For example, on very small problems the runtime overhead of a</i></font><br>
	181	<font color="#0000ed"><i>// parallel for loop may not be worthwhile, or in other situations parallel</i></font><br>
	182	<font color="#0000ed"><i>// execution could cause incorrect behavior.  See below for details.</i></font><br>
	183	<br>
	184	<br>
	185	<font color="#0000ed"><i>// ************************ Thread Pools ****************************</i></font><br>
	186	<font color="#0000ed"><i>// </i></font><br>
	187	<font color="#0000ed"><i>// The above facilities are built on top of a more general thread pool class,</i></font><br>
	188	<font color="#0000ed"><i>// which you may use for your own purposes.</i></font><br>
	189	<font color="#0000ed"><i>//    </i></font><br>
	190	<font color="#0000ed"><i>// You create a thread pool by constructing a BasicThreadPool object.  For</i></font><br>
	191	<font color="#0000ed"><i>// example:</i></font><br>
	192	<br>
	193	<font color="#008b00"><b>long</b></font> nthreads = <font color="#ff8b00">4</font>;<br>
	194	BasicThreadPool pool(nthreads);<br>
	195	<br>
	196	<font color="#0000ed"><i>// creates a thread pool of 4 threads.  These threads will exist until the</i></font><br>
	197	<font color="#0000ed"><i>// destructor for pool is called.  </i></font><br>
	198	<font color="#0000ed"><i>// </i></font><br>
	199	<font color="#0000ed"><i>// The simplest way to use a thread pool is as follows.  Suppose you have a</i></font><br>
	200	<font color="#0000ed"><i>// task that consists of sz subtasks, indexed 0..sz-1.  Then you can write:</i></font><br>
	201	<br>
	202	pool.exec_range(sz, <br>
	203	[&](<font color="#008b00"><b>long</b></font> first, <font color="#008b00"><b>long</b></font> last) {<br>
	204	<font color="#b02f60"><b>for</b></font> (<font color="#008b00"><b>long</b></font> i = first; i < last; i++) {<br>
	205	... code to process subtask i ...<br>
	206	}<br>
	207	}<br>
	208	);<br>
	209	<br>
	210	<font color="#0000ed"><i>// The second argument to exec_range is a C++11 "lambda".  The "[&]" indicates</i></font><br>
	211	<font color="#0000ed"><i>// that all local variables in the calling context are captured by reference,</i></font><br>
	212	<font color="#0000ed"><i>// so the lambda body can reference all visible local variables directly.</i></font><br>
	213	<font color="#0000ed"><i>// C++11 provides other methods for capturing local variables.  The interval</i></font><br>
	214	<font color="#0000ed"><i>// [0..sz) is partitioned into subintervals of the form [first..last), which</i></font><br>
	215	<font color="#0000ed"><i>// are processed by the code in the supplied lambda.</i></font><br>
	216	<font color="#0000ed"><i>// </i></font><br>
	217	<font color="#0000ed"><i>// A lower-level interface is also provided.  One can write:</i></font><br>
	218	<br>
	219	pool.exec_index(cnt,<br>
	220	[&](<font color="#008b00"><b>long</b></font> index) {<br>
	221	... code to process index i ...<br>
	222	}<br>
	223	);<br>
	224	<br>
	225	<font color="#0000ed"><i>// This will activate exactly cnt threads with indices 0..cnt-1, and execute</i></font><br>
	226	<font color="#0000ed"><i>// the given code on each index.  The parameter cnt must not exceed nthreads,</i></font><br>
	227	<font color="#0000ed"><i>// otherwise an error is raised.</i></font><br>
	228	<br>
	229	<br>
	230	<font color="#0000ed"><i>// ====================================================================</i></font><br>
	231	<font color="#0000ed"><i>// </i></font><br>
	232	<font color="#0000ed"><i>// NOTES:</i></font><br>
	233	<font color="#0000ed"><i>// </i></font><br>
	234	<font color="#0000ed"><i>// When one activates a thread pool with nthreads threads, the current thread</i></font><br>
	235	<font color="#0000ed"><i>// (the one activating the pool) will also participate in the computation.</i></font><br>
	236	<font color="#0000ed"><i>// This means that the thread pool only contains nthreads-1 other threads.</i></font><br>
	237	<font color="#0000ed"><i>// </i></font><br>
	238	<font color="#0000ed"><i>// If, during an activation, any thread throws an exception, it will be caught</i></font><br>
	239	<font color="#0000ed"><i>// and rethrown in the activating thread when all the threads complete.  If</i></font><br>
	240	<font color="#0000ed"><i>// more than one thread throws an exception, the first one that is caught is</i></font><br>
	241	<font color="#0000ed"><i>// the one that is rethrown.</i></font><br>
	242	<font color="#0000ed"><i>// </i></font><br>
	243	<font color="#0000ed"><i>// Methods are also provided for adding, deleting, and moving threads in and</i></font><br>
	244	<font color="#0000ed"><i>// among thread pools.</i></font><br>
	245	<font color="#0000ed"><i>// </i></font><br>
	246	<font color="#0000ed"><i>// If NTL_THREADS=off, the corresponding header file may be included, but the</i></font><br>
	247	<font color="#0000ed"><i>// BasicThreadPool class is not defined.</i></font><br>
	248	<font color="#0000ed"><i>//</i></font><br>
	249	<font color="#0000ed"><i>// Unlike most classes in NTL, the BasicThreadPool is not relocatable and hence</i></font><br>
	250	<font color="#0000ed"><i>// cannot be used in a Vec.  One should first wrap it in a pointer class, such</i></font><br>
	251	<font color="#0000ed"><i>// as UniquePtr.</i></font><br>
	252	<br>
	253	<br>
	254	<br>
	255	<font color="#0000ed"><i>// class BasicThreadPool: provides basic functionality for thread pools</i></font><br>
	256	<br>
	257	<font color="#008b00"><b>class</b></font> BasicThreadPool {<br>
	258	<font color="#b02f60"><b>private</b></font>:<br>
	259	<br>
	260	BasicThreadPool(<font color="#008b00"><b>const</b></font> BasicThreadPool&); <font color="#0000ed"><i>// disabled</i></font><br>
	261	<font color="#008b00"><b>void</b></font> <font color="#b02f60"><b>operator</b></font>=(<font color="#008b00"><b>const</b></font> BasicThreadPool&); <font color="#0000ed"><i>// disabled</i></font><br>
	262	<br>
	263	<font color="#b02f60"><b>public</b></font>:<br>
	264	<br>
	265	<font color="#008b00"><b>explicit</b></font><br>
	266	BasicThreadPool(<font color="#008b00"><b>long</b></font> nthreads);<br>
	267	<font color="#0000ed"><i>// creates a pool with nthreads threads, including the current thread</i></font><br>
	268	<font color="#0000ed"><i>// (so nthreads-1 other threads get created)</i></font><br>
	269	<br>
	270	<font color="#008b00"><b>template</b></font><<font color="#008b00"><b>class</b></font> Fct><br>
	271	<font color="#008b00"><b>void</b></font> exec_range(<font color="#008b00"><b>long</b></font> sz, <font color="#008b00"><b>const</b></font> Fct& fct); <br>
	272	<font color="#0000ed"><i>// activate by range (see example usage above)</i></font><br>
	273	<br>
	274	<font color="#008b00"><b>template</b></font><<font color="#008b00"><b>class</b></font> Fct><br>
	275	<font color="#008b00"><b>void</b></font> exec_index(<font color="#008b00"><b>long</b></font> cnt, <font color="#008b00"><b>const</b></font> Fct& fct); <br>
	276	<font color="#0000ed"><i>// activate by index (see example usage above)</i></font><br>
	277	<br>
	278	<font color="#008b00"><b>void</b></font> add(<font color="#008b00"><b>long</b></font> n = <font color="#ff8b00">1</font>);<br>
	279	<font color="#0000ed"><i>// add n threads to the pool</i></font><br>
	280	<br>
	281	<font color="#008b00"><b>long</b></font> NumThreads() <font color="#008b00"><b>const</b></font>;<br>
	282	<font color="#0000ed"><i>// return number of threads (including current thread)</i></font><br>
	283	<br>
	284	<font color="#008b00"><b>void</b></font> remove(<font color="#008b00"><b>long</b></font> n = <font color="#ff8b00">1</font>);<br>
	285	<font color="#0000ed"><i>// remove n threads from the pool</i></font><br>
	286	<br>
	287	<font color="#008b00"><b>void</b></font> move(BasicThreadPool& other, <font color="#008b00"><b>long</b></font> n = <font color="#ff8b00">1</font>); <br>
	288	<font color="#0000ed"><i>// move n threads from other pool to this pool</i></font><br>
	289	<br>
	290	<font color="#008b00"><b>bool</b></font> active() <font color="#008b00"><b>const</b></font>;<br>
	291	<font color="#0000ed"><i>// indicates an activation is in process: invoking any of the methods</i></font><br>
	292	<font color="#0000ed"><i>// exec_index, exec_range, add, remove, move, or the destructor</i></font><br>
	293	<font color="#0000ed"><i>// while active will raise an error</i></font><br>
	294	<br>
	295	<font color="#008b00"><b>template</b></font><<font color="#008b00"><b>class</b></font> Fct><br>
	296	<font color="#008b00"><b>static</b></font> <font color="#008b00"><b>void</b></font> relaxed_exec_range(BasicThreadPool *pool, <font color="#008b00"><b>long</b></font> sz, <font color="#008b00"><b>const</b></font> Fct& fct);<br>
	297	<font color="#0000ed"><i>// similar to pool->exec_range(sz, fct), but will still work even </i></font><br>
	298	<font color="#0000ed"><i>// if !pool or pool->active(), using just the current thread</i></font><br>
	299	<br>
	300	<font color="#008b00"><b>template</b></font><<font color="#008b00"><b>class</b></font> Fct><br>
	301	<font color="#008b00"><b>static</b></font> <font color="#008b00"><b>void</b></font> relaxed_exec_index(BasicThreadPool *pool, <font color="#008b00"><b>long</b></font> cnt, <font color="#008b00"><b>const</b></font> Fct& fct);<br>
	302	<font color="#0000ed"><i>// similar to pool->exec_index(cnt, fct), but will still work even </i></font><br>
	303	<font color="#0000ed"><i>// if !pool or pool->active(), provided cnt <= 1, using just the current thread</i></font><br>
	304	<br>
	305	};<br>
	306	<br>
	307	<br>
	308	<br>
	309	<br>
	310	<font color="#0000ed"><i>// THREAD BOOSTING FEATURES:</i></font><br>
	311	<br>
	312	<font color="#008b00"><b>void</b></font> SetNumThreads(<font color="#008b00"><b>long</b></font> nt);<br>
	313	<font color="#0000ed"><i>// convenience routine to set NTL's thread pool.</i></font><br>
	314	<font color="#0000ed"><i>// If called more than once, the old thread pool is destroyed and</i></font><br>
	315	<font color="#0000ed"><i>// replaced by a new one.</i></font><br>
	316	<font color="#0000ed"><i>// If NTL_THREAD_BOOST=off, then this is still defined, but does nothing.</i></font><br>
	317	<br>
	318	<font color="#008b00"><b>long</b></font> AvailableThreads();<br>
	319	<font color="#0000ed"><i>// Number of threads currently available to use in NTL's thread pool.  This is</i></font><br>
	320	<font color="#0000ed"><i>// always at least 1 (for the current thread).  </i></font><br>
	321	<font color="#0000ed"><i>// If NTL_THREAD_BOOST=off, then this is still defined, and always returns 1.</i></font><br>
	322	<br>
	323	BasicThreadPool *GetThreadPool();<br>
	324	<font color="#008b00"><b>void</b></font> ResetThreadPool(BasicThreadPool *pool = <font color="#ff8b00">0</font>);<br>
	325	BasicThreadPool *ReleaseThreadPool();<br>
	326	<font color="#0000ed"><i>// Routines to get and set NTL's thread pool.  The interfaces parallel NTL's</i></font><br>
	327	<font color="#0000ed"><i>// UniquePtr class, and indeed, behind the scenes, NTL's thread pool is stored</i></font><br>
	328	<font color="#0000ed"><i>// as a UniquePtr<BasicThreadPool>.</i></font><br>
	329	<font color="#0000ed"><i>// These are only declared when NTL_THREAD_BOOST=on.  </i></font><br>
	330	<br>
	331	<br>
	332	<font color="#1773cc">#define NTL_EXEC_RANGE(sz, first, last) ...</font><br>
	333	<font color="#1773cc">#define NTL_EXEC_RANGE_END ...</font><br>
	334	<font color="#1773cc">#define NTL_EXEC_INDEX(cnt, index) ...</font><br>
	335	<font color="#1773cc">#define NTL_EXEC_INDEX_END ...</font><br>
	336	<font color="#0000ed"><i>// convenience macros to implement "parallel for loops" using NTL's thread</i></font><br>
	337	<font color="#0000ed"><i>// pool.  See examples above for usage.  If NTL_THREAD_BOOST=off, then these</i></font><br>
	338	<font color="#0000ed"><i>// are still defined, and code will run on a single thread</i></font><br>
	339	<br>
	340	<br>
	341	<font color="#1773cc">#define NTL_GEXEC_RANGE(seq, sz, first, last) ...</font><br>
	342	<font color="#1773cc">#define NTL_GEXEC_RANGE_END ...</font><br>
	343	<font color="#0000ed"><i>// "guarded" version of NTL_EXEC_RANGE: if seq evaluates to true, the code runs</i></font><br>
	344	<font color="#0000ed"><i>// on a single thread.  This is useful in avoiding situations where the</i></font><br>
	345	<font color="#0000ed"><i>// overhead of a parallel loop is too high.  If seq evaluates to the constant</i></font><br>
	346	<font color="#0000ed"><i>// true, a good compiler will optimize code to run on a single thread, with no</i></font><br>
	347	<font color="#0000ed"><i>// overhead.</i></font><br>
	348	<br>
	349	<font color="#1773cc">#define NTL_IMPORT(x) </font><br>
	350	<font color="#0000ed"><i>// To be used in conjunction with NTL_EXEC_RANGE and friends.  When</i></font><br>
	351	<font color="#0000ed"><i>// NTL_THREAD_BOOST=on, this will copy the variable named x from the enclosing</i></font><br>
	352	<font color="#0000ed"><i>// scope to a local copy.  This should only be used for types with cheap</i></font><br>
	353	<font color="#0000ed"><i>// copies, such as scalars and pointers.  In some situations, this allows the</i></font><br>
	354	<font color="#0000ed"><i>// compiler to optimize a bit more aggressively.  One or more of these may be</i></font><br>
	355	<font color="#0000ed"><i>// placed right after an NTL_EXEC_RANGE.</i></font><br>
	356	<font color="#0000ed"><i>// When NTL_THREAD_BOOST=off, this is still defined, and does nothing.</i></font><br>
	357	<br>
	358	<br>
	359	<font color="#0000ed"><i>// class PartitionInfo: A helper class to facilitate partitioning an interval</i></font><br>
	360	<font color="#0000ed"><i>// into subintervals.  NOTE: this class is available, even when</i></font><br>
	361	<font color="#0000ed"><i>// NTL_THREAD_BOOST=off.</i></font><br>
	362	<br>
	363	<font color="#008b00"><b>class</b></font> PartitionInfo {<br>
	364	<font color="#b02f60"><b>public</b></font>:<br>
	365	<br>
	366	<font color="#008b00"><b>explicit</b></font><br>
	367	PartitionInfo(<font color="#008b00"><b>long</b></font> sz, <font color="#008b00"><b>long</b></font> nt = AvailableThreads()); <br>
	368	<font color="#0000ed"><i>// partitions [0..sz) into at most nt subintervals.  sz may be 0 or</i></font><br>
	369	<font color="#0000ed"><i>// negative, in which case the number of subintervals is 0.</i></font><br>
	370	<br>
	371	<font color="#008b00"><b>long</b></font> NumIntervals() <font color="#008b00"><b>const</b></font>;<br>
	372	<font color="#0000ed"><i>// return the number of subintervals</i></font><br>
	373	<br>
	374	<font color="#008b00"><b>void</b></font> interval(<font color="#008b00"><b>long</b></font>& first, <font color="#008b00"><b>long</b></font>& last, <font color="#008b00"><b>long</b></font> i) <font color="#008b00"><b>const</b></font>;<br>
	375	<font color="#0000ed"><i>// [first..last) is the ith interval, where i in [0..NumIntervals()).  No</i></font><br>
	376	<font color="#0000ed"><i>// range checking is performed.</i></font><br>
	377	<br>
	378	};<br>
	379	<br>
	380	<br>
	381	<br>
	382	</font></body>
	383	</html>

+374

-0

doc/BasicThreadPool.txt less more

	0
	1
	2	/************************************************************************
	3
	4	MODULE: BasicThreadPool
	5
	6	SUMMARY:
	7
	8	A simple thread pool class BasicThreadPool, as well as some higher-level macros
	9	which facilitate simple parallel for loops.
	10
	11
	12	***************************************************************************/
	13
	14
	15	// ******************** Simple parallel for loops ************************
	16	//
	17	// We begin with a description of the higher-level macros for writing simple
	18	// parallel for loops. These facilities are activated only when NTL is
	19	// configured with NTL_THREAD_BOOST=on (which implies NTL_THREADS=on).
	20	// However, code that uses these facilities should still compile and run
	21	// correctly even when NTL_THREAD_BOOST=off, or even when NTL_THREADS=off, so
	22	// this is the simplest way to write parallel for loops across a range of
	23	// compile-time and run-time environments. Note that if NTL_THREADS=on, C++11
	24	// features are required, but when NTL_THREADS=off, these features are not
	25	// required, so the code should compile on older C++ compilers.
	26	//
	27	// Here is a simple recipe for writing a parallel for loop.
	28	//
	29	// At the start of program execution, your program should execute
	30
	31	SetNumThreads(nt);
	32
	33	// You can choose nt to be any positive integer, but for best results, it
	34	// should correspond to the number of available cores on your machine.
	35	// [NOTE: if NTL_THREAD_BOOST=off, this function is still defined, but does
	36	// nothing.]
	37	//
	38	// Now consider the following routine:
	39
	40	void mul(ZZ *x, const ZZ *a, const ZZ *b, long n)
	41	{
	42	for (long i = 0; i < n; i++)
	43	mul(x[i], a[i], b[i]);
	44	}
	45
	46	// We can parallelize it as follows:
	47
	48	void mul(ZZ *x, const ZZ *a, const ZZ *b, long n)
	49	{
	50	NTL_EXEC_RANGE(n, first, last)
	51
	52	for (long i = first; i < last; i++)
	53	mul(x[i], a[i], b[i]);
	54
	55	NTL_EXEC_RANGE_END
	56	}
	57
	58	// NTL_EXEC_RANGE and NTL_EXEC_RANGE_END are macros that just "do the right
	59	// thing". If there are nt threads available, the interval [0..n) will be
	60	// partitioned into (up to) nt subintervals, and a different thread will be
	61	// used to process each subinterval. You still have to write the for loop
	62	// yourself: the macro just declares and initializes variables "first" and
	63	// "last" (or whatever you want to call them) of type long that represent the
	64	// subinterval [first..last) to be processed by one thread.
	65	//
	66	// Note that the current thread participates as one of the nt available
	67	// threads, and that the current thread will wait for all participating threads
	68	// to finish their task before proceeding.
	69	//
	70	// Within the "body" of this construct, you can freely reference any variables
	71	// that are visible at this point. This is implemented using the C++ lambda
	72	// feature (capturing all variables by reference).
	73	//
	74	// This construct will still work even if threads are disabled, in which case
	75	// it runs single-threaded with first=0 and last=n.
	76	//
	77	// Note that the code within the EXEC_RANGE body could call other routines that
	78	// themselves attempt to execute an EXEC_RANGE: if this happens, the latter
	79	// EXEC_RANGE will detect this and run single-threaded.
	80	//
	81	// You may wish to do other things within the EXEC_RANGE body than just execute
	82	// a loop. One thing you may want to do is to declare variables. Another
	83	// thing you may want to do is setup a local context for a ZZ_p modulus (or
	84	// other type of modulus). Here is an example of doing this:
	85
	86
	87	void mul(ZZ_p *x, const ZZ_p *a, const ZZ_p *b, long n)
	88	{
	89	ZZ_pContext context;
	90	context.save();
	91
	92	NTL_EXEC_RANGE(n, first, last)
	93
	94	context.restore();
	95
	96	for (long i = first; i < last; i++)
	97	mul(x[i], a[i], b[i]);
	98
	99	NTL_EXEC_RANGE_END
	100	}
	101
	102
	103	// Another useful function is AvailableThreads(), which will return the number
	104	// of available threads. If threads or thread boosting is not enabled, this
	105	// will return 1. Even if thread boosting is enabled, this may return 1 if for
	106	// whatever reason, the thread pool is not available for use (for example,
	107	// SetNumThreads was never called, or the thread pool is already active).
	108	//
	109	// A lower-level set of tools is available, which allow you to simply run a
	110	// specified number of threads. Assuming nt <= AvailableThreads(), the code
	111
	112	NTL_EXEC_INDEX(nt, index)
	113
	114	... code ...
	115
	116	NTL_EXEC_INDEX_END
	117
	118	// will execute the body on nt different threads, each with a unique index in
	119	// the range [0..nt). A variable named "index" (or whatever name you specify)
	120	// of type long will hold the given index.
	121	//
	122	// This tool is useful if you need to manage memory a bit more carefully. For
	123	// example, the following code will compute an inner product using all
	124	// available threads:
	125
	126	ZZ InnerProd(const ZZ *a, const ZZ *b, long n)
	127	{
	128	PartitionInfo pinfo(n);
	129
	130	long cnt = pinfo.NumIntervals();
	131
	132	Vec<ZZ> acc;
	133	acc.SetLength(cnt);
	134
	135	NTL_EXEC_INDEX(cnt, index)
	136
	137	long first, last;
	138	pinfo.interval(first, last, index);
	139
	140	ZZ& sum = acc[index];
	141	sum = 0;
	142	for (long i = first; i < last; i++)
	143	MulAddTo(sum, a[i], b[i]);
	144
	145	NTL_EXEC_INDEX_END
	146
	147	ZZ sum;
	148	sum = 0;
	149	for (long i = 0; i < cnt; i++)
	150	sum += acc[i];
	151
	152	return sum;
	153	}
	154
	155	// This example also illustrates the class PartitionInfo, which is useful for
	156	// partitioning a large interval into smaller intervals (it is used internally
	157	// by EXEC_RANGE). The constructor takes a single argument (in this example n)
	158	// and computes a partition of [0..n) into nearly equally sized subintervals.
	159	// The method NumIntervals() returns the number of subintervals, and the method
	160	// interval(first, last, index) sets first and last according to the endpoints
	161	// of the subinterval [first..last) with the given index.
	162	//
	163	// So in this example, cnt threads will run, each accumulating a sum into a
	164	// corresponding element of the vector acc, and afterwards, these elements are
	165	// summed.
	166	//
	167	// Note that if threads are not enabled or otherwise unavailable, the above
	168	// code will compile and run correctly (just using one thread).
	169	//
	170	// Finally, there is a "guarded" version of NTL_EXEC_RANGE called
	171	// NTL_GEXEC_RANGE. This allows one to dynamically "guard" against parallel
	172	// execution. For example, on very small problems the runtime overhead of a
	173	// parallel for loop may not be worthwhile, or in other situations parallel
	174	// execution could cause incorrect behavior. See below for details.
	175
	176
	177	// ************************ Thread Pools ****************************
	178	//
	179	// The above facilities are built on top of a more general thread pool class,
	180	// which you may use for your own purposes.
	181	//
	182	// You create a thread pool by constructing a BasicThreadPool object. For
	183	// example:
	184
	185	long nthreads = 4;
	186	BasicThreadPool pool(nthreads);
	187
	188	// creates a thread pool of 4 threads. These threads will exist until the
	189	// destructor for pool is called.
	190	//
	191	// The simplest way to use a thread pool is as follows. Suppose you have a
	192	// task that consists of sz subtasks, indexed 0..sz-1. Then you can write:
	193
	194	pool.exec_range(sz,
	195	[&](long first, long last) {
	196	for (long i = first; i < last; i++) {
	197	... code to process subtask i ...
	198	}
	199	}
	200	);
	201
	202	// The second argument to exec_range is a C++11 "lambda". The "[&]" indicates
	203	// that all local variables in the calling context are captured by reference,
	204	// so the lambda body can reference all visible local variables directly.
	205	// C++11 provides other methods for capturing local variables. The interval
	206	// [0..sz) is partitioned into subintervals of the form [first..last), which
	207	// are processed by the code in the supplied lambda.
	208	//
	209	// A lower-level interface is also provided. One can write:
	210
	211	pool.exec_index(cnt,
	212	[&](long index) {
	213	... code to process index i ...
	214	}
	215	);
	216
	217	// This will activate exactly cnt threads with indices 0..cnt-1, and execute
	218	// the given code on each index. The parameter cnt must not exceed nthreads,
	219	// otherwise an error is raised.
	220
	221
	222	// ====================================================================
	223	//
	224	// NOTES:
	225	//
	226	// When one activates a thread pool with nthreads threads, the current thread
	227	// (the one activating the pool) will also participate in the computation.
	228	// This means that the thread pool only contains nthreads-1 other threads.
	229	//
	230	// If, during an activation, any thread throws an exception, it will be caught
	231	// and rethrown in the activating thread when all the threads complete. If
	232	// more than one thread throws an exception, the first one that is caught is
	233	// the one that is rethrown.
	234	//
	235	// Methods are also provided for adding, deleting, and moving threads in and
	236	// among thread pools.
	237	//
	238	// If NTL_THREADS=off, the corresponding header file may be included, but the
	239	// BasicThreadPool class is not defined.
	240	//
	241	// Unlike most classes in NTL, the BasicThreadPool is not relocatable and hence
	242	// cannot be used in a Vec. One should first wrap it in a pointer class, such
	243	// as UniquePtr.
	244
	245
	246
	247	// class BasicThreadPool: provides basic functionality for thread pools
	248
	249	class BasicThreadPool {
	250	private:
	251
	252	BasicThreadPool(const BasicThreadPool&); // disabled
	253	void operator=(const BasicThreadPool&); // disabled
	254
	255	public:
	256
	257	explicit
	258	BasicThreadPool(long nthreads);
	259	// creates a pool with nthreads threads, including the current thread
	260	// (so nthreads-1 other threads get created)
	261
	262	template<class Fct>
	263	void exec_range(long sz, const Fct& fct);
	264	// activate by range (see example usage above)
	265
	266	template<class Fct>
	267	void exec_index(long cnt, const Fct& fct);
	268	// activate by index (see example usage above)
	269
	270	void add(long n = 1);
	271	// add n threads to the pool
	272
	273	long NumThreads() const;
	274	// return number of threads (including current thread)
	275
	276	void remove(long n = 1);
	277	// remove n threads from the pool
	278
	279	void move(BasicThreadPool& other, long n = 1);
	280	// move n threads from other pool to this pool
	281
	282	bool active() const;
	283	// indicates an activation is in process: invoking any of the methods
	284	// exec_index, exec_range, add, remove, move, or the destructor
	285	// while active will raise an error
	286
	287	template<class Fct>
	288	static void relaxed_exec_range(BasicThreadPool *pool, long sz, const Fct& fct);
	289	// similar to pool->exec_range(sz, fct), but will still work even
	290	// if !pool or pool->active(), using just the current thread
	291
	292	template<class Fct>
	293	static void relaxed_exec_index(BasicThreadPool *pool, long cnt, const Fct& fct);
	294	// similar to pool->exec_index(cnt, fct), but will still work even
	295	// if !pool or pool->active(), provided cnt <= 1, using just the current thread
	296
	297	};
	298
	299
	300
	301
	302	// THREAD BOOSTING FEATURES:
	303
	304	void SetNumThreads(long nt);
	305	// convenience routine to set NTL's thread pool.
	306	// If called more than once, the old thread pool is destroyed and
	307	// replaced by a new one.
	308	// If NTL_THREAD_BOOST=off, then this is still defined, but does nothing.
	309
	310	long AvailableThreads();
	311	// Number of threads currently available to use in NTL's thread pool. This is
	312	// always at least 1 (for the current thread).
	313	// If NTL_THREAD_BOOST=off, then this is still defined, and always returns 1.
	314
	315	BasicThreadPool *GetThreadPool();
	316	void ResetThreadPool(BasicThreadPool *pool = 0);
	317	BasicThreadPool *ReleaseThreadPool();
	318	// Routines to get and set NTL's thread pool. The interfaces parallel NTL's
	319	// UniquePtr class, and indeed, behind the scenes, NTL's thread pool is stored
	320	// as a UniquePtr<BasicThreadPool>.
	321	// These are only declared when NTL_THREAD_BOOST=on.
	322
	323
	324	#define NTL_EXEC_RANGE(sz, first, last) ...
	325	#define NTL_EXEC_RANGE_END ...
	326	#define NTL_EXEC_INDEX(cnt, index) ...
	327	#define NTL_EXEC_INDEX_END ...
	328	// convenience macros to implement "parallel for loops" using NTL's thread
	329	// pool. See examples above for usage. If NTL_THREAD_BOOST=off, then these
	330	// are still defined, and code will run on a single thread
	331
	332
	333	#define NTL_GEXEC_RANGE(seq, sz, first, last) ...
	334	#define NTL_GEXEC_RANGE_END ...
	335	// "guarded" version of NTL_EXEC_RANGE: if seq evaluates to true, the code runs
	336	// on a single thread. This is useful in avoiding situations where the
	337	// overhead of a parallel loop is too high. If seq evaluates to the constant
	338	// true, a good compiler will optimize code to run on a single thread, with no
	339	// overhead.
	340
	341	#define NTL_IMPORT(x)
	342	// To be used in conjunction with NTL_EXEC_RANGE and friends. When
	343	// NTL_THREAD_BOOST=on, this will copy the variable named x from the enclosing
	344	// scope to a local copy. This should only be used for types with cheap
	345	// copies, such as scalars and pointers. In some situations, this allows the
	346	// compiler to optimize a bit more aggressively. One or more of these may be
	347	// placed right after an NTL_EXEC_RANGE.
	348	// When NTL_THREAD_BOOST=off, this is still defined, and does nothing.
	349
	350
	351	// class PartitionInfo: A helper class to facilitate partitioning an interval
	352	// into subintervals. NOTE: this class is available, even when
	353	// NTL_THREAD_BOOST=off.
	354
	355	class PartitionInfo {
	356	public:
	357
	358	explicit
	359	PartitionInfo(long sz, long nt = AvailableThreads());
	360	// partitions [0..sz) into at most nt subintervals. sz may be 0 or
	361	// negative, in which case the number of subintervals is 0.
	362
	363	long NumIntervals() const;
	364	// return the number of subintervals
	365
	366	void interval(long& first, long& last, long i) const;
	367	// [first..last) is the ith interval, where i in [0..NumIntervals()). No
	368	// range checking is performed.
	369
	370	};
	371
	372
	373

-1

doc/GF2.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/GF2E.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2E.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2E.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/GF2EX.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2EX.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2EX.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-2

doc/GF2EXFactoring.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2EXFactoring.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2EXFactoring.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

85	85	<font color="#0000ed"><i>// this routine uses external files to store some intermediate</i></font><br>
86	86	<font color="#0000ed"><i>// results, which are removed if the routine terminates normally.</i></font><br>
87	87	<font color="#0000ed"><i>// These files are stored in the current directory under names of the</i></font><br>
88		<font color="#0000ed"><i>// form ddf--baby- and ddf--giant-.</i></font><br>
	88	<font color="#0000ed"><i>// form tmp-*.</i></font><br>
89	89	<font color="#0000ed"><i>// The definition of "large" is controlled by the variable</i></font><br>
90	90	<br>
91	91	<font color="#008b00"><b>extern</b></font> <font color="#008b00"><b>double</b></font> GF2EXFileThresh<br>

-1

doc/GF2EXFactoring.txt less more

77	77	// this routine uses external files to store some intermediate
78	78	// results, which are removed if the routine terminates normally.
79	79	// These files are stored in the current directory under names of the
80		// form ddf--baby- and ddf--giant-.
	80	// form tmp-*.
81	81	// The definition of "large" is controlled by the variable
82	82
83	83	extern double GF2EXFileThresh

-1

doc/GF2X.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2X.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2X.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/GF2XFactoring.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2XFactoring.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2XFactoring.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/GF2XVec.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/GF2XVec.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/GF2XVec.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/HNF.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/HNF.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/HNF.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/LLL.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/LLL.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/LLL.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/Lazy.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/Lazy.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/Lazy.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/LazyTable.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/LazyTable.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/LazyTable.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

+56

-1

doc/RR.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/RR.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/RR.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

25	25	<br>
26	26	<font color="#0000ed"><i>The minimum precision that can be set is 53 bits.</i></font><br>
27	27	<font color="#0000ed"><i>The maximum precision is limited only by the word size of the machine.</i></font><br>
	28	<br>
	29	<font color="#0000ed"><i>A convenience class RRPush is provided to automatically save and</i></font><br>
	30	<font color="#0000ed"><i>restore the current precision.</i></font><br>
28	31	<br>
29	32	<font color="#0000ed"><i>All arithmetic operations are implemented so that the effect is as if the</i></font><br>
30	33	<font color="#0000ed"><i>result was computed exactly, and then rounded to p bits.  If a number</i></font><br>

356	359	ZZ RoundToZZ(<font color="#008b00"><b>const</b></font> RR& a);            <font color="#0000ed"><i>// ties are rounded to an even integer</i></font><br>
357	360	<br>
358	361	<br>
	362	<br>
	363	<a name="push"></a>
	364	<br>
	365	<font color="#0000ed"><i>/</i></font><font color="#0000ed"><i>************************************************************************\</i></font><br>
	366	<br>
	367	<font color="#0000ed"><i>                 Saving and restoring the current precision</i></font><br>
	368	<br>
	369	<font color="#0000ed"><i>\************************************************************************</i></font><font color="#0000ed"><i>/</i></font><br>
	370	<br>
	371	<br>
	372	<font color="#008b00"><b>class</b></font> RRPush {<br>
	373	<font color="#b02f60"><b>public</b></font>:<br>
	374	RRPush();  <font color="#0000ed"><i>// saves the current precision</i></font><br>
	375	~RRPush(); <font color="#0000ed"><i>// restores the saved precision</i></font><br>
	376	<br>
	377	<font color="#b02f60"><b>private</b></font>: <br>
	378	RRPush(<font color="#008b00"><b>const</b></font> RRPush&); <font color="#0000ed"><i>// disable</i></font><br>
	379	<font color="#008b00"><b>void</b></font> <font color="#b02f60"><b>operator</b></font>=(<font color="#008b00"><b>const</b></font> RRPush&); <font color="#0000ed"><i>// disable</i></font><br>
	380	};<br>
	381	<br>
	382	<br>
	383	<font color="#0000ed"><i>// Example: </i></font><br>
	384	<font color="#0000ed"><i>//</i></font><br>
	385	<font color="#0000ed"><i>// {</i></font><br>
	386	<font color="#0000ed"><i>//    RRPush push;  // don't forget to declare a variable!!</i></font><br>
	387	<font color="#0000ed"><i>//    RR::SetPrecision(new_p);</i></font><br>
	388	<font color="#0000ed"><i>//    ...</i></font><br>
	389	<font color="#0000ed"><i>// } // old precision restored when scope is exited</i></font><br>
	390	<br>
	391	<br>
	392	<font color="#008b00"><b>class</b></font> RROutputPush {<br>
	393	<font color="#b02f60"><b>public</b></font>:<br>
	394	RROutputPush();   <font color="#0000ed"><i>// saves the current output precision</i></font><br>
	395	~RROutputPush();  <font color="#0000ed"><i>// restores the saved output precision</i></font><br>
	396	<br>
	397	<font color="#b02f60"><b>private</b></font>: <br>
	398	RROutputPush(<font color="#008b00"><b>const</b></font> RROutputPush&); <font color="#0000ed"><i>// disable</i></font><br>
	399	<font color="#008b00"><b>void</b></font> <font color="#b02f60"><b>operator</b></font>=(<font color="#008b00"><b>const</b></font> RROutputPush&); <font color="#0000ed"><i>// disable</i></font><br>
	400	};<br>
	401	<br>
	402	<br>
	403	<font color="#0000ed"><i>// Example: </i></font><br>
	404	<font color="#0000ed"><i>//</i></font><br>
	405	<font color="#0000ed"><i>// {</i></font><br>
	406	<font color="#0000ed"><i>//    RROutputPush push;  // don't forget to declare a variable!!</i></font><br>
	407	<font color="#0000ed"><i>//    RR::SetOutputPrecision(new_op);</i></font><br>
	408	<font color="#0000ed"><i>//    ...</i></font><br>
	409	<font color="#0000ed"><i>// } // old output precision restored when scope is exited</i></font><br>
	410	<br>
	411	<br>
	412	<br>
	413	<br>
359	414	<font color="#0000ed"><i>/</i></font><font color="#0000ed"><i>************************************************************************\</i></font><br>
360	415	<br>
361	416	<font color="#0000ed"><i>                                 Miscellaneous</i></font><br>

+55

-0

doc/RR.txt less more

17	17
18	18	The minimum precision that can be set is 53 bits.
19	19	The maximum precision is limited only by the word size of the machine.
	20
	21	A convenience class RRPush is provided to automatically save and
	22	restore the current precision.
20	23
21	24	All arithmetic operations are implemented so that the effect is as if the
22	25	result was computed exactly, and then rounded to p bits. If a number

348	351	ZZ RoundToZZ(const RR& a); // ties are rounded to an even integer
349	352
350	353
	354
	355	// @anchor{push}
	356
	357	/**************************************************************************\
	358
	359	Saving and restoring the current precision
	360
	361	\**************************************************************************/
	362
	363
	364	class RRPush {
	365	public:
	366	RRPush(); // saves the current precision
	367	~RRPush(); // restores the saved precision
	368
	369	private:
	370	RRPush(const RRPush&); // disable
	371	void operator=(const RRPush&); // disable
	372	};
	373
	374
	375	// Example:
	376	//
	377	// {
	378	// RRPush push; // don't forget to declare a variable!!
	379	// RR::SetPrecision(new_p);
	380	// ...
	381	// } // old precision restored when scope is exited
	382
	383
	384	class RROutputPush {
	385	public:
	386	RROutputPush(); // saves the current output precision
	387	~RROutputPush(); // restores the saved output precision
	388
	389	private:
	390	RROutputPush(const RROutputPush&); // disable
	391	void operator=(const RROutputPush&); // disable
	392	};
	393
	394
	395	// Example:
	396	//
	397	// {
	398	// RROutputPush push; // don't forget to declare a variable!!
	399	// RR::SetOutputPrecision(new_op);
	400	// ...
	401	// } // old output precision restored when scope is exited
	402
	403
	404
	405
351	406	/**************************************************************************\
352	407
353	408	Miscellaneous

+16

-1

doc/SmartPtr.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/SmartPtr.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/SmartPtr.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

405	405	<font color="#0000ed"><i>// returns raw pointer, and sets the raw pointer to null</i></font><br>
406	406	<br>
407	407	<font color="#008b00"><b>void</b></font> move(UniquePtr& other);<br>
	408	<font color="#008b00"><b>template</b></font><<font color="#008b00"><b>class</b></font> Y> <font color="#008b00"><b>void</b></font> move(UniquePtr<Y>& other);<br>
408	409	<font color="#0000ed"><i>// move other to *this</i></font><br>
	410	<font color="#0000ed"><i>// in the second version, Y* should be convertible to T*</i></font><br>
	411	<font color="#0000ed"><i>// NOTE: if Y is a subclass of T, then typically, ~T() should</i></font><br>
	412	<font color="#0000ed"><i>// be virtual, to ensure that the destructor for Y is called</i></font><br>
409	413	<br>
410	414	<font color="#008b00"><b>void</b></font> swap(UniquePtr& other);<br>
411	415	<font color="#0000ed"><i>// swap raw pointers</i></font><br>

463	467	<br>
464	468	<font color="#0000ed"><i>   p1.val()             // dereference</i></font><br>
465	469	<br>
	470	<font color="#0000ed"><i>   rp = p1.get();       // fetch raw pointer</i></font><br>
	471	<font color="#0000ed"><i>   rp = p1.release();   // fetch raw pointer, and set to NULL</i></font><br>
466	472	<font color="#0000ed"><i>   p1.move(p2);         // move p2 to p1, destroying p1's referent</i></font><br>
467	473	<font color="#0000ed"><i>                        //   if p1 != p2</i></font><br>
468	474	<br>

510	516	<font color="#008b00"><b>bool</b></font> exists() <font color="#008b00"><b>const</b></font>;<br>
511	517	<font color="#0000ed"><i>// checks that underlying pointer is not null</i></font><br>
512	518	<br>
	519	T* get() <font color="#008b00"><b>const</b></font>;<br>
	520	<font color="#0000ed"><i>// returns underlying raw pointer</i></font><br>
	521	<br>
	522	T* release();<br>
	523	<font color="#0000ed"><i>// returns raw pointer, and sets the raw pointer to null</i></font><br>
	524	<br>
513	525	<font color="#008b00"><b>void</b></font> move(OptionalVal& other);<br>
514	526	<font color="#0000ed"><i>// performs a (shallow) pointer move</i></font><br>
515	527	<br>

595	607	<br>
596	608	T* get() <font color="#008b00"><b>const</b></font>;<br>
597	609	<font color="#0000ed"><i>// get raw pointer</i></font><br>
	610	<br>
	611	T* elts() <font color="#008b00"><b>const</b></font>;<br>
	612	<font color="#0000ed"><i>// get raw pointer (for compatibility with the Vec class)</i></font><br>
598	613	<br>
599	614	T* release();<br>
600	615	<font color="#0000ed"><i>// get raw pointer and reset to null</i></font><br>

+15

-0

doc/SmartPtr.txt less more

397	397	// returns raw pointer, and sets the raw pointer to null
398	398
399	399	void move(UniquePtr& other);
	400	template<class Y> void move(UniquePtr<Y>& other);
400	401	// move other to *this
	402	// in the second version, Y* should be convertible to T*
	403	// NOTE: if Y is a subclass of T, then typically, ~T() should
	404	// be virtual, to ensure that the destructor for Y is called
401	405
402	406	void swap(UniquePtr& other);
403	407	// swap raw pointers

455	459
456	460	p1.val() // dereference
457	461
	462	rp = p1.get(); // fetch raw pointer
	463	rp = p1.release(); // fetch raw pointer, and set to NULL
458	464	p1.move(p2); // move p2 to p1, destroying p1's referent
459	465	// if p1 != p2
460	466

502	508	bool exists() const;
503	509	// checks that underlying pointer is not null
504	510
	511	T* get() const;
	512	// returns underlying raw pointer
	513
	514	T* release();
	515	// returns raw pointer, and sets the raw pointer to null
	516
505	517	void move(OptionalVal& other);
506	518	// performs a (shallow) pointer move
507	519

587	599
588	600	T* get() const;
589	601	// get raw pointer
	602
	603	T* elts() const;
	604	// get raw pointer (for compatibility with the Vec class)
590	605
591	606	T* release();
592	607	// get raw pointer and reset to null

+90

-14

doc/ZZ.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

326	326	};<br>
327	327	<br>
328	328	<font color="#008b00"><b>long</b></font> InvModStatus(ZZ& x, <font color="#008b00"><b>const</b></font> ZZ& a, <font color="#008b00"><b>const</b></font> ZZ& n);<br>
329		<font color="#0000ed"><i>// if gcd(a,b) = 1, then return-value = 0, x = a^{-1} mod n;</i></font><br>
	329	<font color="#0000ed"><i>// if gcd(a,n) = 1, then return-value = 0, x = a^{-1} mod n;</i></font><br>
330	330	<font color="#0000ed"><i>// otherwise, return-value = 1, x = gcd(a, n)</i></font><br>
331	331	<br>
332	332	<font color="#008b00"><b>void</b></font> PowerMod(ZZ& x, <font color="#008b00"><b>const</b></font> ZZ& a, <font color="#008b00"><b>const</b></font> ZZ& e, <font color="#008b00"><b>const</b></font> ZZ& n);<br>

400	400	<br>
401	401	<font color="#008b00"><b>long</b></font> InvMod(<font color="#008b00"><b>long</b></font> a, <font color="#008b00"><b>long</b></font> n);<br>
402	402	<font color="#0000ed"><i>// computes a^{-1} mod n.  Error is raised if undefined.</i></font><br>
	403	<br>
	404	<font color="#008b00"><b>long</b></font> InvModStatus(<font color="#008b00"><b>long</b></font>& x, <font color="#008b00"><b>long</b></font> a, <font color="#008b00"><b>long</b></font> n);<br>
	405	<font color="#0000ed"><i>// if gcd(a,n) = 1, then return-value = 0, x = a^{-1} mod n;</i></font><br>
	406	<font color="#0000ed"><i>// otherwise, return-value = 1, x = gcd(a, n)</i></font><br>
403	407	<br>
404	408	<font color="#008b00"><b>long</b></font> PowerMod(<font color="#008b00"><b>long</b></font> a, <font color="#008b00"><b>long</b></font> e, <font color="#008b00"><b>long</b></font> n);<br>
405	409	<font color="#0000ed"><i>// computes a^e mod n (e may be negative)</i></font><br>

654	658	<font color="#0000ed"><i>// NumBytes(0) == 0.</i></font><br>
655	659	<br>
656	660	<br>
	661	<a name="prg"></a>
657	662	<br>
658	663	<font color="#0000ed"><i>/</i></font><font color="#0000ed"><i>************************************************************************\</i></font><br>
659	664	<br>

673	678	<font color="#0000ed"><i>// seen by a client program.</i></font><br>
674	679	<br>
675	680	<br>
676		<font color="#008b00"><b>void</b></font> SetSeed(<font color="#008b00"><b>const</b></font> ZZ& s); <br>
677		<font color="#0000ed"><i>// Initializes generator with a "seed" s.</i></font><br>
678		<font color="#0000ed"><i>// s is first hashed to generate the initial state, so it is</i></font><br>
679		<font color="#0000ed"><i>// not necessary that s itself looks random, just that </i></font><br>
680		<font color="#0000ed"><i>// it has a lot of "entropy".</i></font><br>
681		<font color="#0000ed"><i>// If SetSeed is not called before using the routines below,</i></font><br>
682		<font color="#0000ed"><i>// a default initial seed is used.</i></font><br>
683		<font color="#0000ed"><i>// This default seed is guaranteed to be unique among different</i></font><br>
684		<font color="#0000ed"><i>// threads in a given process, and an attempt is made to </i></font><br>
685		<font color="#0000ed"><i>// make this seed globally unique among all threads and processes.</i></font><br>
686		<font color="#0000ed"><i>// Routine ZZFromBytes (above) may be useful for constructing seeds</i></font><br>
687		<font color="#0000ed"><i>// from arbitrary binary data.</i></font><br>
	681	<font color="#008b00"><b>void</b></font> SetSeed(<font color="#008b00"><b>const</b></font> ZZ& s);<br>
	682	<font color="#008b00"><b>void</b></font> SetSeed(<font color="#008b00"><b>const</b></font> <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>char</b></font> *data, <font color="#008b00"><b>long</b></font> dlen);<br>
	683	<font color="#008b00"><b>void</b></font> SetSeed(<font color="#008b00"><b>const</b></font> RandomStream& s);<br>
	684	<font color="#0000ed"><i>// Initializes generator with a "seed".</i></font><br>
	685	<br>
	686	<font color="#0000ed"><i>// The first version hashes the binary representation of s to obtain a key for</i></font><br>
	687	<font color="#0000ed"><i>// a low-level RandomStream object (see below).</i></font><br>
	688	<br>
	689	<font color="#0000ed"><i>// The second version does the same, hashing the first dlen bytes pointed to by</i></font><br>
	690	<font color="#0000ed"><i>// data to obtain a key for the RandomStream object.</i></font><br>
	691	<br>
	692	<font color="#0000ed"><i>// The third version initializes the PRG state directly with the given</i></font><br>
	693	<font color="#0000ed"><i>// RandomStream object.</i></font><br>
	694	<br>
	695	<font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
688	696	<br>
689	697	<br>
690	698	<font color="#008b00"><b>void</b></font> RandomBnd(ZZ& x, <font color="#008b00"><b>const</b></font> ZZ& n);<br>
691	699	ZZ RandomBnd(<font color="#008b00"><b>const</b></font> ZZ& n);<br>
	700	<font color="#008b00"><b>void</b></font> RandomBnd(<font color="#008b00"><b>long</b></font>& x, <font color="#008b00"><b>long</b></font> n);<br>
692	701	<font color="#008b00"><b>long</b></font> RandomBnd(<font color="#008b00"><b>long</b></font> n);<br>
693	702	<font color="#0000ed"><i>// x = pseudo-random number in the range 0..n-1, or 0 if n <= 0</i></font><br>
	703	<font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
694	704	<br>
695	705	<font color="#008b00"><b>void</b></font> RandomBits(ZZ& x, <font color="#008b00"><b>long</b></font> l);<br>
696	706	ZZ RandomBits_ZZ(<font color="#008b00"><b>long</b></font> l);<br>
	707	<font color="#008b00"><b>void</b></font> RandomBits(<font color="#008b00"><b>long</b></font>& x, <font color="#008b00"><b>long</b></font> l);<br>
697	708	<font color="#008b00"><b>long</b></font> RandomBits_long(<font color="#008b00"><b>long</b></font> l);<br>
698	709	<font color="#0000ed"><i>// x = pseudo-random number in the range 0..2^l-1.</i></font><br>
	710	<font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
699	711	<br>
700	712	<font color="#008b00"><b>void</b></font> RandomLen(ZZ& x, <font color="#008b00"><b>long</b></font> l);<br>
701	713	ZZ RandomLen_ZZ(<font color="#008b00"><b>long</b></font> l);<br>
	714	<font color="#008b00"><b>void</b></font> RandomLen(<font color="#008b00"><b>long</b></font>& x, <font color="#008b00"><b>long</b></font> l);<br>
702	715	<font color="#008b00"><b>long</b></font> RandomLen_long(<font color="#008b00"><b>long</b></font> l);<br>
703	716	<font color="#0000ed"><i>// x = pseudo-random number with precisely l bits,</i></font><br>
704	717	<font color="#0000ed"><i>// or 0 if l <= 0.</i></font><br>
	718	<font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
705	719	<br>
706	720	<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> RandomBits_ulong(<font color="#008b00"><b>long</b></font> l);<br>
707	721	<font color="#0000ed"><i>// returns a pseudo-random number in the range 0..2^l-1</i></font><br>
	722	<font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
708	723	<br>
709	724	<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> RandomWord();<br>
710	725	<font color="#0000ed"><i>// returns a word filled with pseudo-random bits.</i></font><br>
711	726	<font color="#0000ed"><i>// Equivalent to RandomBits_ulong(NTL_BITS_PER_LONG).</i></font><br>
	727	<font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
	728	<br>
	729	<br>
	730	<br>
	731	<font color="#008b00"><b>class</b></font> RandomStream { <br>
	732	<font color="#0000ed"><i>// The low-level pseudo-random generator (PRG).</i></font><br>
	733	<font color="#0000ed"><i>// After initializing it with a key, one can effectively read an unbounded</i></font><br>
	734	<font color="#0000ed"><i>// stream of pseudorandom bytes</i></font><br>
	735	<br>
	736	<font color="#b02f60"><b>public</b></font>:<br>
	737	<br>
	738	<font color="#008b00"><b>explicit</b></font> RandomStream(<font color="#008b00"><b>const</b></font> <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>char</b></font> *key);<br>
	739	<font color="#0000ed"><i>// key should point to an array of NTL_PRG_KEYLEN bytes</i></font><br>
	740	<font color="#0000ed"><i>// EXCEPTIONS: nothrow</i></font><br>
	741	<br>
	742	<font color="#008b00"><b>void</b></font> get(<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>char</b></font> *res, <font color="#008b00"><b>long</b></font> n); <br>
	743	<font color="#0000ed"><i>// read the next n bytes from the stream and store to location pointed to by</i></font><br>
	744	<font color="#0000ed"><i>// res</i></font><br>
	745	<font color="#0000ed"><i>// EXCEPTIONS: throws a LogicError exception if n is negative</i></font><br>
	746	<br>
	747	RandomStream(<font color="#008b00"><b>const</b></font> RandomStream&); <font color="#0000ed"><i>// default</i></font><br>
	748	RandomStream& <font color="#b02f60"><b>operator</b></font>=(<font color="#008b00"><b>const</b></font> RandomStream&); <font color="#0000ed"><i>// default</i></font><br>
	749	<font color="#0000ed"><i>// EXCEPTIONS: nothrow</i></font><br>
	750	};<br>
	751	<br>
	752	<br>
	753	RandomStream& GetCurrentRandomStream();<br>
	754	<font color="#0000ed"><i>// get reference to the current PRG state. If SetSeed has not been called, it</i></font><br>
	755	<font color="#0000ed"><i>// is called with a default value (which should be unique to each</i></font><br>
	756	<font color="#0000ed"><i>// process/thread).  NOTE: this is a reference to a thread-local object, so</i></font><br>
	757	<font color="#0000ed"><i>// different threads will use different PRG's, and by default, each will be</i></font><br>
	758	<font color="#0000ed"><i>// initialized with a unique seed.</i></font><br>
	759	<font color="#0000ed"><i>// NOTE: using this reference, you can copy the current PRG state or assign a</i></font><br>
	760	<font color="#0000ed"><i>// different value to it; however, see the helper class RandomStreamPush below,</i></font><br>
	761	<font color="#0000ed"><i>// which may be more convenient.</i></font><br>
	762	<font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
	763	<br>
	764	<br>
	765	<br>
	766	<font color="#008b00"><b>class</b></font> RandomStreamPush {<br>
	767	<font color="#0000ed"><i>// RAII for saving/restoring current PRG state</i></font><br>
	768	<font color="#b02f60"><b>public</b></font>:<br>
	769	RandomStreamPush();   <font color="#0000ed"><i>// save a copy of the current PRG state</i></font><br>
	770	<font color="#0000ed"><i>// EXCEPTIONS: strong ES</i></font><br>
	771	<br>
	772	~RandomStreamPush();  <font color="#0000ed"><i>// restore the saved copy of the PRG state</i></font><br>
	773	<br>
	774	<font color="#b02f60"><b>private</b></font>: <br>
	775	RandomStreamPush(<font color="#008b00"><b>const</b></font> RandomStreamPush&); <font color="#0000ed"><i>// disable</i></font><br>
	776	<font color="#008b00"><b>void</b></font> <font color="#b02f60"><b>operator</b></font>=(<font color="#008b00"><b>const</b></font> RandomStreamPush&); <font color="#0000ed"><i>// disable</i></font><br>
	777	};<br>
	778	<br>
	779	<br>
	780	<font color="#008b00"><b>void</b></font> DeriveKey(<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>char</b></font> *key, <font color="#008b00"><b>long</b></font> klen,  <br>
	781	<font color="#008b00"><b>const</b></font> <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>char</b></font> *data, <font color="#008b00"><b>long</b></font> dlen);<br>
	782	<font color="#0000ed"><i>// utility routine to derive from the byte string (data, dlen) a byte string</i></font><br>
	783	<font color="#0000ed"><i>// (key, klen).  Heuristically, if (data, dlen) has high entropy, then (key,</i></font><br>
	784	<font color="#0000ed"><i>// klen) should be pseudorandom.  This routine is also used internally to</i></font><br>
	785	<font color="#0000ed"><i>// derive PRG keys.</i></font><br>
	786	<font color="#0000ed"><i>// EXCEPTIONS: throws LogicError exception if klen < 0 or dlen < 0</i></font><br>
	787	<br>
712	788	<br>
713	789	<br>
714	790	<font color="#0000ed"><i>/</i></font><font color="#0000ed"><i>************************************************************************\</i></font><br>

+89

-13

doc/ZZ.txt less more

318	318	};
319	319
320	320	long InvModStatus(ZZ& x, const ZZ& a, const ZZ& n);
321		// if gcd(a,b) = 1, then return-value = 0, x = a^{-1} mod n;
	321	// if gcd(a,n) = 1, then return-value = 0, x = a^{-1} mod n;
322	322	// otherwise, return-value = 1, x = gcd(a, n)
323	323
324	324	void PowerMod(ZZ& x, const ZZ& a, const ZZ& e, const ZZ& n);

392	392
393	393	long InvMod(long a, long n);
394	394	// computes a^{-1} mod n. Error is raised if undefined.
	395
	396	long InvModStatus(long& x, long a, long n);
	397	// if gcd(a,n) = 1, then return-value = 0, x = a^{-1} mod n;
	398	// otherwise, return-value = 1, x = gcd(a, n)
395	399
396	400	long PowerMod(long a, long e, long n);
397	401	// computes a^e mod n (e may be negative)

646	650	// NumBytes(0) == 0.
647	651
648	652
	653	// @anchor{prg}
649	654
650	655	/**************************************************************************\
651	656

665	670	// seen by a client program.
666	671
667	672
668		void SetSeed(const ZZ& s);
669		// Initializes generator with a "seed" s.
670		// s is first hashed to generate the initial state, so it is
671		// not necessary that s itself looks random, just that
672		// it has a lot of "entropy".
673		// If SetSeed is not called before using the routines below,
674		// a default initial seed is used.
675		// This default seed is guaranteed to be unique among different
676		// threads in a given process, and an attempt is made to
677		// make this seed globally unique among all threads and processes.
678		// Routine ZZFromBytes (above) may be useful for constructing seeds
679		// from arbitrary binary data.
	673	void SetSeed(const ZZ& s);
	674	void SetSeed(const unsigned char *data, long dlen);
	675	void SetSeed(const RandomStream& s);
	676	// Initializes generator with a "seed".
	677
	678	// The first version hashes the binary representation of s to obtain a key for
	679	// a low-level RandomStream object (see below).
	680
	681	// The second version does the same, hashing the first dlen bytes pointed to by
	682	// data to obtain a key for the RandomStream object.
	683
	684	// The third version initializes the PRG state directly with the given
	685	// RandomStream object.
	686
	687	// EXCEPTIONS: strong ES
680	688
681	689
682	690	void RandomBnd(ZZ& x, const ZZ& n);
683	691	ZZ RandomBnd(const ZZ& n);
	692	void RandomBnd(long& x, long n);
684	693	long RandomBnd(long n);
685	694	// x = pseudo-random number in the range 0..n-1, or 0 if n <= 0
	695	// EXCEPTIONS: strong ES
686	696
687	697	void RandomBits(ZZ& x, long l);
688	698	ZZ RandomBits_ZZ(long l);
	699	void RandomBits(long& x, long l);
689	700	long RandomBits_long(long l);
690	701	// x = pseudo-random number in the range 0..2^l-1.
	702	// EXCEPTIONS: strong ES
691	703
692	704	void RandomLen(ZZ& x, long l);
693	705	ZZ RandomLen_ZZ(long l);
	706	void RandomLen(long& x, long l);
694	707	long RandomLen_long(long l);
695	708	// x = pseudo-random number with precisely l bits,
696	709	// or 0 if l <= 0.
	710	// EXCEPTIONS: strong ES
697	711
698	712	unsigned long RandomBits_ulong(long l);
699	713	// returns a pseudo-random number in the range 0..2^l-1
	714	// EXCEPTIONS: strong ES
700	715
701	716	unsigned long RandomWord();
702	717	// returns a word filled with pseudo-random bits.
703	718	// Equivalent to RandomBits_ulong(NTL_BITS_PER_LONG).
	719	// EXCEPTIONS: strong ES
	720
	721
	722
	723	class RandomStream {
	724	// The low-level pseudo-random generator (PRG).
	725	// After initializing it with a key, one can effectively read an unbounded
	726	// stream of pseudorandom bytes
	727
	728	public:
	729
	730	explicit RandomStream(const unsigned char *key);
	731	// key should point to an array of NTL_PRG_KEYLEN bytes
	732	// EXCEPTIONS: nothrow
	733
	734	void get(unsigned char *res, long n);
	735	// read the next n bytes from the stream and store to location pointed to by
	736	// res
	737	// EXCEPTIONS: throws a LogicError exception if n is negative
	738
	739	RandomStream(const RandomStream&); // default
	740	RandomStream& operator=(const RandomStream&); // default
	741	// EXCEPTIONS: nothrow
	742	};
	743
	744
	745	RandomStream& GetCurrentRandomStream();
	746	// get reference to the current PRG state. If SetSeed has not been called, it
	747	// is called with a default value (which should be unique to each
	748	// process/thread). NOTE: this is a reference to a thread-local object, so
	749	// different threads will use different PRG's, and by default, each will be
	750	// initialized with a unique seed.
	751	// NOTE: using this reference, you can copy the current PRG state or assign a
	752	// different value to it; however, see the helper class RandomStreamPush below,
	753	// which may be more convenient.
	754	// EXCEPTIONS: strong ES
	755
	756
	757
	758	class RandomStreamPush {
	759	// RAII for saving/restoring current PRG state
	760	public:
	761	RandomStreamPush(); // save a copy of the current PRG state
	762	// EXCEPTIONS: strong ES
	763
	764	~RandomStreamPush(); // restore the saved copy of the PRG state
	765
	766	private:
	767	RandomStreamPush(const RandomStreamPush&); // disable
	768	void operator=(const RandomStreamPush&); // disable
	769	};
	770
	771
	772	void DeriveKey(unsigned char *key, long klen,
	773	const unsigned char *data, long dlen);
	774	// utility routine to derive from the byte string (data, dlen) a byte string
	775	// (key, klen). Heuristically, if (data, dlen) has high entropy, then (key,
	776	// klen) should be pseudorandom. This routine is also used internally to
	777	// derive PRG keys.
	778	// EXCEPTIONS: throws LogicError exception if klen < 0 or dlen < 0
	779
704	780
705	781
706	782	/**************************************************************************\

-1

doc/ZZVec.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZVec.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZVec.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/ZZX.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZX.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZX.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/ZZXFactoring.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZXFactoring.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZXFactoring.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-2

doc/ZZ_p.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_p.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_p.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

274	274	<br>
275	275	<font color="#0000ed"><i>      ZZ_p::init(p2); // install p2</i></font><br>
276	276	<br>
277		<font color="#0000ed"><i>      // reinstall original modulus as close of scope</i></font><br>
	277	<font color="#0000ed"><i>      // reinstall original modulus at close of scope</i></font><br>
278	278	<font color="#0000ed"><i>   }</i></font><br>
279	279	<br>
280	280	<font color="#0000ed"><i>      </i></font><br>

-1

doc/ZZ_p.txt less more

266	266
267	267	ZZ_p::init(p2); // install p2
268	268
269		// reinstall original modulus as close of scope
	269	// reinstall original modulus at close of scope
270	270	}
271	271
272	272

-1

doc/ZZ_pE.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_pE.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_pE.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/ZZ_pEX.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_pEX.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_pEX.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-2

doc/ZZ_pEXFactoring.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_pEXFactoring.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_pEXFactoring.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

64	64	<font color="#0000ed"><i>// this routine uses external files to store some intermediate</i></font><br>
65	65	<font color="#0000ed"><i>// results, which are removed if the routine terminates normally.</i></font><br>
66	66	<font color="#0000ed"><i>// These files are stored in the current directory under names of the</i></font><br>
67		<font color="#0000ed"><i>// form ddf--baby- and ddf--giant-.</i></font><br>
	67	<font color="#0000ed"><i>// form tmp-*.</i></font><br>
68	68	<font color="#0000ed"><i>// The definition of "large" is controlled by the variable</i></font><br>
69	69	<br>
70	70	<font color="#008b00"><b>extern</b></font> <font color="#008b00"><b>double</b></font> ZZ_pEXFileThresh<br>

-1

doc/ZZ_pEXFactoring.txt less more

56	56	// this routine uses external files to store some intermediate
57	57	// results, which are removed if the routine terminates normally.
58	58	// These files are stored in the current directory under names of the
59		// form ddf--baby- and ddf--giant-.
	59	// form tmp-*.
60	60	// The definition of "large" is controlled by the variable
61	61
62	62	extern double ZZ_pEXFileThresh

-1

doc/ZZ_pX.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_pX.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_pX.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

464	464	<font color="#008b00"><b>void</b></font> MulByXMod(ZZ_pX& x, <font color="#008b00"><b>const</b></font> ZZ_pX& a, <font color="#008b00"><b>const</b></font> ZZ_pX& f);<br>
465	465	ZZ_pX MulByXMod(<font color="#008b00"><b>const</b></font> ZZ_pX& a, <font color="#008b00"><b>const</b></font> ZZ_pX& f);<br>
466	466	<font color="#0000ed"><i>// x = (a * X) mod f</i></font><br>
	467	<font color="#0000ed"><i>// NOTE: thread boosting enabled only if x does not alias a</i></font><br>
467	468	<br>
468	469	<font color="#008b00"><b>void</b></font> InvMod(ZZ_pX& x, <font color="#008b00"><b>const</b></font> ZZ_pX& a, <font color="#008b00"><b>const</b></font> ZZ_pX& f);<br>
469	470	ZZ_pX InvMod(<font color="#008b00"><b>const</b></font> ZZ_pX& a, <font color="#008b00"><b>const</b></font> ZZ_pX& f);<br>

-0

doc/ZZ_pX.txt less more

456	456	void MulByXMod(ZZ_pX& x, const ZZ_pX& a, const ZZ_pX& f);
457	457	ZZ_pX MulByXMod(const ZZ_pX& a, const ZZ_pX& f);
458	458	// x = (a * X) mod f
	459	// NOTE: thread boosting enabled only if x does not alias a
459	460
460	461	void InvMod(ZZ_pX& x, const ZZ_pX& a, const ZZ_pX& f);
461	462	ZZ_pX InvMod(const ZZ_pX& a, const ZZ_pX& f);

-2

doc/ZZ_pXFactoring.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/ZZ_pXFactoring.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/ZZ_pXFactoring.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

81	81	<font color="#0000ed"><i>// this routine uses external files to store some intermediate</i></font><br>
82	82	<font color="#0000ed"><i>// results, which are removed if the routine terminates normally.</i></font><br>
83	83	<font color="#0000ed"><i>// These files are stored in the current directory under names of the</i></font><br>
84		<font color="#0000ed"><i>// form ddf--baby- and ddf--giant-.  </i></font><br>
	84	<font color="#0000ed"><i>// form tmp-*.</i></font><br>
85	85	<font color="#0000ed"><i>// The definition of "large" is controlled by the variable</i></font><br>
86	86	<br>
87	87	<font color="#008b00"><b>extern</b></font> <font color="#008b00"><b>double</b></font> ZZ_pXFileThresh<br>

-1

doc/ZZ_pXFactoring.txt less more

73	73	// this routine uses external files to store some intermediate
74	74	// results, which are removed if the routine terminates normally.
75	75	// These files are stored in the current directory under names of the
76		// form ddf--baby- and ddf--giant-.
	76	// form tmp-*.
77	77	// The definition of "large" is controlled by the variable
78	78
79	79	extern double ZZ_pXFileThresh

+98

-55

doc/config.txt less more

17	17	########### Here are the most important variables, and their default values.
18	18
19	19	CXX=g++ # The C++ compiler
	20
20	21	CXXFLAGS=-g -O2 # C++ compilation flags
21	22
22		DEF_PREFIX=/usr/local # Default software directory
	23	NATIVE=on # compiles code targeted to current hardware
	24
	25	DEF_PREFIX=/usr/local# Default software directory
	26
23	27	PREFIX=$(DEF_PREFIX) # Directory in which to install NTL library components
24	28	SHARED=off # Generate a shared library (as well as static)
25	29
26	30	NTL_THREADS=off # compile in thread-safe mode
27		NTL_EXCEPTIONS=off # compile in thread-safe mode
28
29		NTL_GMP_LIP=off # Switch to enable the use of GMP as primary
	31	NTL_THREAD_BOOST=off # compile with thread boosting enabled
	32	NTL_EXCEPTIONS=off # compile in exception-safe mode
	33
	34	NTL_GMP_LIP=on # Switch to enable the use of GMP as primary
30	35	# long integer package
31	36
32	37	GMP_PREFIX=$(DEF_PREFIX) # Directory in which GMP components are installed
33	38
34		NTL_PCLMUL=off # switch to enable the PCLMUL instruction
35		# on x86 machines for faster arithmetic over
36		# GF(2)[X] (without relying on the gf2x package)
37	39
38	40	NTL_GF2X_LIB=off # Switch to enable the use of the gf2x package
39	41	# for faster arithmetic over GF(2)[X]

56	58
57	59	CXXFLAGS=-g -O2
58	60
59		# Flags for the C++ compiler
60		#
61		# Note that if CXXFLAGS has not been explicitly set,
62		# then the option -std=c++11 is added if either NTL_THREADS
63		# or NTL_EXCEPTIONS is set, and the option -pthread is added
64		# if NTL_THREADS is set.
	61	# Flags for the C++ compiler.
	62
	63
	64	NATIVE=on
	65
	66	# Flag to target code to current hardware.
	67
65	68
66	69
67	70	########## Installation path:

113	116	# mutexes, and thread_local storage. Your compiler may not
114	117	# yet support these features.
115	118
	119	# Note that this option is currently only supported with
	120	# NTL_GMP_LIP=on.
	121
	122	########## thread boosting
	123
	124	NTL_THREAD_BOOST=off
	125
	126	# Set to 'on' if you want to compile NTL so that it does
	127	# certain internal computations using multiple threads.
	128	# Setting this flag automatically sets the NTL_THREADS flag.
	129	# This feature is a work in progress. See documentation in
	130	# BasicThreadPool.txt for more details.
116	131
117	132	########## exceptions
118	133

125	140
126	141	########## GMP variables:
127	142
128		NTL_GMP_LIP=off
129
130		# Set to 'on' if you want to use GMP, the GNU Multi-Precision package,
	143	NTL_GMP_LIP=on
	144
	145	# Set to 'off' if you don't want to use GMP, the GNU Multi-Precision package,
131	146	# as the primary long integer package.
132		# This will typically yield significantly faster long integer arithmetic
133		# compared to the traditional long integer package.
134
135		# If you set this flag, please note the following.
136		# If you have installed GMP in a standard "system" location, this is
137		# all you have to do. Otherwise, if GMP is built, but not installed
138		# in a standard place, you have to set the variable GMP_PREFIX.
	147	# This will lead to significantly slower code, and is not
	148	# recommended.
139	149
140	150
141	151	GMP_PREFIX=$(DEF_PREFIX)

149	159	# For finer-grained control, set the variables GMP_INCDIR and GMP_LIBDIR
150	160	# instead (see below).
151	161
152		######### PCLMUL accelerator:
153
154		NTL_PCLMUL=off
155
156		# set to 'on' if you want to enable the use of the PCLMUL
157		# instruction on x86 machines. The configuration script
158		# will adjust the makefile as well, and run a test program
159		# to make sure it really works.
160		#
161		# This is an aletrnative to using the gf2x library (see below).
162		# That library is currently not thread or exception safe.
163
164	162
165	163	########## GF2X variables:
166	164

206	204
207	205
208	206
209		# If GMP is installed in a standard system directory, and you want to use it:
210
211		./configure NTL_GMP_LIP=on
212
213
214	207
215	208	# If GMP was installed in a non-standard directory, say, $HOME/sw:
216	209
217		./configure NTL_GMP_LIP=on GMP_PREFIX=$HOME/sw
218
	210	./configure GMP_PREFIX=$HOME/sw
219	211
220	212
221	213	# If you want to use the options -g and -O for compiling C++,

226	218	# Note the use of quotes to keep the argument in one piece.
227	219
228	220
229		# If you want to use both GMP and the gf2x library:
230
231		./configure NTL_GMP_LIP=on NTL_GF2X_LIB=on
232
233
234		# If you want to use GMP as well as traditional (non-ISO) mode:
235
236		./configure NTL_GMP_LIP=on NTL_STD_CXX=off
237
	221	# If you want to use the gf2x library:
	222
	223	./configure NTL_GF2X_LIB=on
	224
	225
	226
	227	###########
	228	########### A little magic
	229	###########
	230
	231	CXXAUTOFLAGS=
	232
	233	# This is a variable that is automagically set by the configuration script.
	234	# These are C++ compiler flags that are selected depending on
	235	# the choice of other configuration options, and is geared towards gcc.
	236	# The configuration script always prints out the value it chooses.
	237	# If you explicitly set a value when invoking the configuration script,
	238	# then it will not change that value.
238	239
239	240
240	241

255	256	INCLUDEDIR=$(PREFIX)/include
256	257	DOCDIR=$(PREFIX)/share/doc
257	258
	259	NTL_DISABLE_TLS_HACK=off
	260	NTL_ENABLE_TLS_HACK=off
258	261
259	262	NTL_LEGACY_NO_NAMESPACE=off
260	263	NTL_LEGACY_INPUT_ERROR=off

271	274	NTL_NO_INIT_TRANS=off
272	275	NTL_DISABLE_LONGDOUBLE=off
273	276	NTL_DISABLE_LONGLONG=off
	277	NTL_DISABLE_LL_ASM=off
	278	NTL_MAXIMIZE_SP_NBITS=off
274	279
275	280	WIZARD=on
276	281	NTL_LONG_LONG=off

285	290	NTL_GF2X_NOINLINE=off
286	291	NTL_GF2X_ALTCODE=off
287	292	NTL_GF2X_ALTCODE1=off
	293	NTL_PCLMUL=off
288	294
289	295	GMP_INCDIR=$(GMP_PREFIX)/include
290	296	GMP_LIBDIR=$(GMP_PREFIX)/lib

338	344	LIBTOOL=libtool
339	345
340	346	# the libtool command -- only needed if SHARED=on
	347
341	348
342	349
343	350

367	374	# Execution of 'make install' copies header files into $(INCLUDEDIR)/NTL,
368	375	# copies the library itself to $(LIBDIR)/libntl.a, and copies the
369	376	# documentation files into $(DOCDIR)/NTL.
	377
	378	########## Disable/enable TLS hack
	379
	380	NTL_DISABLE_TLS_HACK=off
	381	NTL_ENABLE_TLS_HACK=off
	382
	383	# when building NTL with NTL_THREADS=on, if the compiler is gcc-compatible, a
	384	# "TLS hack" may be used to work around the fact that many compilers do not
	385	# (correctly) implement C++11's thread_local feature. The workaround is to use
	386	# gcc's more limited __thread feature, and to emulate thread_local semantics
	387	# using pthread routines.
	388	#
	389	# "gcc-compatible" means that the "__GNUC__" macro is defined, which means the
	390	# TLS hack may be used for gcc, clang, and icc compilers. The current version
	391	# of NTL will enable this hack by default, but you can disable it by specifying
	392	# NTL_DISABLE_TLS_HACK=on. At some point in the future, this default behavior
	393	# may change, in which case you will still be able to force the TLS hack by
	394	# specifying NTL_ENABLE_TLS_HACK=on.
	395
	396
	397
370	398
371	399
372	400

480	508
481	509	NTL_DISABLE_LONGDOUBLE=off
482	510
483		# Explicitly disables us of long double arithmetic in the
484		# single-precision modular arithmetic routines
	511	# Explicitly disables use of long double arithmetic
485	512
486	513	NTL_DISABLE_LONGLONG=off
487	514
488		# Explicitly disables us of long long arithmetic in the
489		# single-precision modular arithmetic routines
	515	# Explicitly disables use of long long arithmetic
	516
	517	NTL_DISABLE_LL_ASM=off
	518
	519	# Explicitly disables use of inline asm as replacement for
	520	# long long arithmetic
	521
	522	NTL_MAXIMIZE_SP_NBITS=on
	523
	524	# Allows for 62-bit single-precision moduli on 64-bit platforms.
	525	# By default, such moduli are restricted to 60 bits, which
	526	# usually gives slightly better performance across a range of
	527	# parameters.
490	528
491	529
492	530

600	638
601	639	# Yet another alternative implementation for GF2X multiplication.
602	640
	641	NTL_PCLMUL=off
	642
	643	# switch to enable the PCLMUL instruction on x86 machines for faster arithmetic
	644	# over GF(2)[X] (without relying on the gf2x package)
	645
603	646
604	647
605	648	########## More GMP Options:

+13

-5

doc/copying.txt less more

1	1	COPYRIGHT NOTICE
2	2
3	3	NTL -- A Library for Doing Number Theory
4		Copyright (C) 1996-2015 Victor Shoup
	4	Copyright (C) 1996-2016 Victor Shoup
5	5
6	6	The most recent version of NTL is available at http://www.shoup.net
7	7

28	28	distributions. In general, the individual files do not contain
29	29	copyright notices.
30	30
	31	Note that the file ZZ.c contains an implementation of SHA256
	32	which is derived from work by Brad Conte, which is in the public domain.
	33	See file ZZ.c for a more detailed notice.
	34
	35	Note that the file mat_lzz_p.c contains an implementation of Strassen's
	36	matrix multiplication algorithm which is derived from the implementation
	37	in FLINT v2.5.2. The latter is copyrighted by Martin Albrecht, William Hart,
	38	and Fredrik Johansson, and also licensed under the GPL.
	39	See file mat_lzz_p.c for a more detailed notice.
	40
31	41	Note that the quad_float package is derived from the doubledouble package,
32		originally developed by Keith Briggs, and also licensed unger the GNU GPL.
	42	originally developed by Keith Briggs, and also licensed under the GNU GPL.
33	43	The files quad_float.c and quad_float.h contain more detailed copyright
34	44	notices.
35	45

37	47	from---and represents an extensive modification of---
38	48	a package originally developed and copyrighted by Arjen Lenstra,
39	49	who has agreed to renounce any copyright claims on the particular
40		version of the long integer package appearing in NTL, so that the
	50	version of the long integer package appearing in NTL, so that
41	51	this package now is covered by the GNU GPL as well.
42	52
43	53	Note that the alternative long integer package used by NTL is GMP,
44	54	which is written by Torbjorn Granlund <tege@swox.com>.
45	55	GMP is licensed under the terms of the GNU Lesser General Public License.
46	56
47		Note that NTL makes use of the RSA Data Security, Inc. MD5 Message
48		Digest Algorithm.
49	57
50	58	Note that prior to version 4.0, NTL was distributed under the following terms:
51	59	NTL is freely available for research and educational purposes.

-1

doc/lzz_p.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_p.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_p.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/lzz_pE.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_pE.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_pE.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/lzz_pEX.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_pEX.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_pEX.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-2

doc/lzz_pEXFactoring.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_pEXFactoring.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_pEXFactoring.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

64	64	<font color="#0000ed"><i>// this routine uses external files to store some intermediate</i></font><br>
65	65	<font color="#0000ed"><i>// results, which are removed if the routine terminates normally.</i></font><br>
66	66	<font color="#0000ed"><i>// These files are stored in the current directory under names of the</i></font><br>
67		<font color="#0000ed"><i>// form ddf--baby- and ddf--giant-.</i></font><br>
	67	<font color="#0000ed"><i>// form tmp-*.</i></font><br>
68	68	<font color="#0000ed"><i>// The definition of "large" is controlled by the variable</i></font><br>
69	69	<br>
70	70	<font color="#008b00"><b>extern</b></font> <font color="#008b00"><b>double</b></font> zz_pEXFileThresh<br>

-1

doc/lzz_pEXFactoring.txt less more

56	56	// this routine uses external files to store some intermediate
57	57	// results, which are removed if the routine terminates normally.
58	58	// These files are stored in the current directory under names of the
59		// form ddf--baby- and ddf--giant-.
	59	// form tmp-*.
60	60	// The definition of "large" is controlled by the variable
61	61
62	62	extern double zz_pEXFileThresh

+40

-1

doc/lzz_pX.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_pX.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_pX.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

725	725	<font color="#0000ed"><i>// and the power projection and minimal polynomial routines below, </i></font><br>
726	726	<font color="#0000ed"><i>// and indirectly affects many routines in zz_pXFactoring.</i></font><br>
727	727	<br>
	728	<br>
	729	<a name="compmod"></a>
	730	<br>
	731	<font color="#0000ed"><i>/</i></font><font color="#0000ed"><i>************************************************************************\</i></font><br>
	732	<br>
	733	<font color="#0000ed"><i>                     Faster Composition with Pre-Conditioning</i></font><br>
	734	<br>
	735	<font color="#0000ed"><i>A new, experimental version of composition with preconditioning.</i></font><br>
	736	<font color="#0000ed"><i>This interface was introduced in NTL v9.6.3, and it should be </i></font><br>
	737	<font color="#0000ed"><i>considered a preliminary interface and subject to change (although</i></font><br>
	738	<font color="#0000ed"><i>it is likely to not change very much).</i></font><br>
	739	<br>
	740	<font color="#0000ed"><i>Usage:</i></font><br>
	741	<font color="#0000ed"><i>    zz_pX x, g, h;</i></font><br>
	742	<font color="#0000ed"><i>    zz_pXModulus F;</i></font><br>
	743	<font color="#0000ed"><i>    zz_pXArgument H;</i></font><br>
	744	<font color="#0000ed"><i>    build(H, h, F);</i></font><br>
	745	<font color="#0000ed"><i>    zz_pXAltArgument H1;</i></font><br>
	746	<font color="#0000ed"><i>    build(H1, H, F);  // this keeps a pointer to H, so H must remain alive</i></font><br>
	747	<font color="#0000ed"><i>    CompMod(x, g, H1, F);  // x = g(h) mod f</i></font><br>
	748	<br>
	749	<font color="#0000ed"><i>The idea is that H1 stores the data in H in an alternative format</i></font><br>
	750	<font color="#0000ed"><i>that allows for a more cache-friendly and more efficient execution</i></font><br>
	751	<font color="#0000ed"><i>of CompMod.  Depending on a variety of factors, this can be up to </i></font><br>
	752	<font color="#0000ed"><i>about 3x faster than the regular CompMod.</i></font><br>
	753	<br>
	754	<br>
	755	<font color="#0000ed"><i>\************************************************************************</i></font><font color="#0000ed"><i>/</i></font><br>
	756	<br>
	757	<font color="#008b00"><b>class</b></font>  zz_pXAltArgument { <br>
	758	<font color="#0000ed"><i>// ...</i></font><br>
	759	};<br>
	760	<br>
	761	<font color="#008b00"><b>void</b></font> build(zz_pXAltArgument& altH, <font color="#008b00"><b>const</b></font> zz_pXArgument& H, <font color="#008b00"><b>const</b></font> zz_pXModulus& F);<br>
	762	<font color="#008b00"><b>void</b></font> CompMod(zz_pX& x, <font color="#008b00"><b>const</b></font> zz_pX& g, <font color="#008b00"><b>const</b></font> zz_pXAltArgument& A, <br>
	763	<font color="#008b00"><b>const</b></font> zz_pXModulus& F);<br>
	764	<br>
	765	<br>
	766	<br>
728	767	<font color="#0000ed"><i>/</i></font><font color="#0000ed"><i>************************************************************************\</i></font><br>
729	768	<br>
730	769	<font color="#0000ed"><i>                     power projection routines</i></font><br>

+39

-0

doc/lzz_pX.txt less more

717	717	// and the power projection and minimal polynomial routines below,
718	718	// and indirectly affects many routines in zz_pXFactoring.
719	719
	720
	721	// @anchor{compmod}
	722
	723	/**************************************************************************\
	724
	725	Faster Composition with Pre-Conditioning
	726
	727	A new, experimental version of composition with preconditioning.
	728	This interface was introduced in NTL v9.6.3, and it should be
	729	considered a preliminary interface and subject to change (although
	730	it is likely to not change very much).
	731
	732	Usage:
	733	zz_pX x, g, h;
	734	zz_pXModulus F;
	735	zz_pXArgument H;
	736	build(H, h, F);
	737	zz_pXAltArgument H1;
	738	build(H1, H, F); // this keeps a pointer to H, so H must remain alive
	739	CompMod(x, g, H1, F); // x = g(h) mod f
	740
	741	The idea is that H1 stores the data in H in an alternative format
	742	that allows for a more cache-friendly and more efficient execution
	743	of CompMod. Depending on a variety of factors, this can be up to
	744	about 3x faster than the regular CompMod.
	745
	746
	747	\**************************************************************************/
	748
	749	class zz_pXAltArgument {
	750	// ...
	751	};
	752
	753	void build(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F);
	754	void CompMod(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
	755	const zz_pXModulus& F);
	756
	757
	758
720	759	/**************************************************************************\
721	760
722	761	power projection routines

-1

doc/lzz_pXFactoring.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/lzz_pXFactoring.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/lzz_pXFactoring.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-2

doc/mat_GF2.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_GF2.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_GF2.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

67	67	<font color="#0000ed"><i>// X = transpose of A</i></font><br>
68	68	<br>
69	69	<font color="#008b00"><b>void</b></font> solve(GF2& d, vec_GF2& x, <font color="#008b00"><b>const</b></font> mat_GF2& A, <font color="#008b00"><b>const</b></font> vec_GF2& b);<br>
70		<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = det(A).  </i></font><br>
	70	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
71	71	<font color="#0000ed"><i>// If d != 0, solves x*A = b. </i></font><br>
	72	<br>
	73	<font color="#008b00"><b>void</b></font> solve(GF2& d, <font color="#008b00"><b>const</b></font> mat_GF2& A, vec_GF2& x, <font color="#008b00"><b>const</b></font> vec_GF2& b);<br>
	74	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
	75	<font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
72	76	<br>
73	77	<font color="#008b00"><b>void</b></font> inv(GF2& d, mat_GF2& X, <font color="#008b00"><b>const</b></font> mat_GF2& A);<br>
74	78	<font color="#0000ed"><i>// A is an n x n matrix.  Computes d = det(A).  If d != 0,</i></font><br>

-1

doc/mat_GF2.txt less more

59	59	// X = transpose of A
60	60
61	61	void solve(GF2& d, vec_GF2& x, const mat_GF2& A, const vec_GF2& b);
62		// A is an n x n matrix, b is a length n vector. Computes d = det(A).
	62	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
63	63	// If d != 0, solves x*A = b.
	64
	65	void solve(GF2& d, const mat_GF2& A, vec_GF2& x, const vec_GF2& b);
	66	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
	67	// If d != 0, solves A*x = b (so x and b are treated as column vectors).
64	68
65	69	void inv(GF2& d, mat_GF2& X, const mat_GF2& A);
66	70	// A is an n x n matrix. Computes d = det(A). If d != 0,

-5

doc/mat_GF2E.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_GF2E.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_GF2E.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

63	63	mat_GF2E transpose(<font color="#008b00"><b>const</b></font> mat_GF2E& A);<br>
64	64	<font color="#0000ed"><i>// X = transpose of A</i></font><br>
65	65	<br>
66		<font color="#008b00"><b>void</b></font> solve(GF2E& d, vec_GF2E& X,<br>
67		<font color="#008b00"><b>const</b></font> mat_GF2E& A, <font color="#008b00"><b>const</b></font> vec_GF2E& b);<br>
68		<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d =</i></font><br>
69		<font color="#0000ed"><i>// determinant(A).  If d != 0, solves x*A = b.</i></font><br>
	66	<font color="#008b00"><b>void</b></font> solve(GF2E& d, vec_GF2E& x, <font color="#008b00"><b>const</b></font> mat_GF2E& A, <font color="#008b00"><b>const</b></font> vec_GF2E& b);<br>
	67	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
	68	<font color="#0000ed"><i>// If d != 0, solves x*A = b.</i></font><br>
	69	<br>
	70	<font color="#008b00"><b>void</b></font> solve(GF2E& d, <font color="#008b00"><b>const</b></font> mat_GF2E& A, vec_GF2E& x, <font color="#008b00"><b>const</b></font> vec_GF2E& b);<br>
	71	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
	72	<font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
70	73	<br>
71	74	<font color="#008b00"><b>void</b></font> inv(GF2E& d, mat_GF2E& X, <font color="#008b00"><b>const</b></font> mat_GF2E& A);<br>
72	75	<font color="#0000ed"><i>// A is an n x n matrix.  Computes d = determinant(A).  If d != 0,</i></font><br>

-4

doc/mat_GF2E.txt less more

55	55	mat_GF2E transpose(const mat_GF2E& A);
56	56	// X = transpose of A
57	57
58		void solve(GF2E& d, vec_GF2E& X,
59		const mat_GF2E& A, const vec_GF2E& b);
60		// A is an n x n matrix, b is a length n vector. Computes d =
61		// determinant(A). If d != 0, solves x*A = b.
	58	void solve(GF2E& d, vec_GF2E& x, const mat_GF2E& A, const vec_GF2E& b);
	59	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
	60	// If d != 0, solves x*A = b.
	61
	62	void solve(GF2E& d, const mat_GF2E& A, vec_GF2E& x, const vec_GF2E& b);
	63	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
	64	// If d != 0, solves A*x = b (so x and b are treated as column vectors).
62	65
63	66	void inv(GF2E& d, mat_GF2E& X, const mat_GF2E& A);
64	67	// A is an n x n matrix. Computes d = determinant(A). If d != 0,

-1

doc/mat_RR.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_RR.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_RR.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/mat_ZZ.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_ZZ.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_ZZ.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-5

doc/mat_ZZ_p.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_ZZ_p.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_ZZ_p.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

59	59	mat_ZZ_p transpose(<font color="#008b00"><b>const</b></font> mat_ZZ_p& A);<br>
60	60	<font color="#0000ed"><i>// X = transpose of A</i></font><br>
61	61	<br>
62		<font color="#008b00"><b>void</b></font> solve(ZZ_p& d, vec_ZZ_p& X,<br>
63		<font color="#008b00"><b>const</b></font> mat_ZZ_p& A, <font color="#008b00"><b>const</b></font> vec_ZZ_p& b);<br>
64		<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d =</i></font><br>
65		<font color="#0000ed"><i>// determinant(A).  If d != 0, solves x*A = b.</i></font><br>
	62	<font color="#008b00"><b>void</b></font> solve(ZZ_p& d, vec_ZZ_p& x, <font color="#008b00"><b>const</b></font> mat_ZZ_p& A, <font color="#008b00"><b>const</b></font> vec_ZZ_p& b);<br>
	63	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
	64	<font color="#0000ed"><i>// If d != 0, solves x*A = b.</i></font><br>
	65	<br>
	66	<font color="#008b00"><b>void</b></font> solve(ZZ_p& d, <font color="#008b00"><b>const</b></font> mat_ZZ_p& A, vec_ZZ_p& x, <font color="#008b00"><b>const</b></font> vec_ZZ_p& b);<br>
	67	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
	68	<font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
66	69	<br>
67	70	<font color="#008b00"><b>void</b></font> inv(ZZ_p& d, mat_ZZ_p& X, <font color="#008b00"><b>const</b></font> mat_ZZ_p& A);<br>
68	71	<font color="#0000ed"><i>// A is an n x n matrix.  Computes d = determinant(A).  If d != 0,</i></font><br>

-4

doc/mat_ZZ_p.txt less more

51	51	mat_ZZ_p transpose(const mat_ZZ_p& A);
52	52	// X = transpose of A
53	53
54		void solve(ZZ_p& d, vec_ZZ_p& X,
55		const mat_ZZ_p& A, const vec_ZZ_p& b);
56		// A is an n x n matrix, b is a length n vector. Computes d =
57		// determinant(A). If d != 0, solves x*A = b.
	54	void solve(ZZ_p& d, vec_ZZ_p& x, const mat_ZZ_p& A, const vec_ZZ_p& b);
	55	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
	56	// If d != 0, solves x*A = b.
	57
	58	void solve(ZZ_p& d, const mat_ZZ_p& A, vec_ZZ_p& x, const vec_ZZ_p& b);
	59	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
	60	// If d != 0, solves A*x = b (so x and b are treated as column vectors).
58	61
59	62	void inv(ZZ_p& d, mat_ZZ_p& X, const mat_ZZ_p& A);
60	63	// A is an n x n matrix. Computes d = determinant(A). If d != 0,

-5

doc/mat_ZZ_pE.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_ZZ_pE.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_ZZ_pE.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

61	61	mat_ZZ_pE transpose(<font color="#008b00"><b>const</b></font> mat_ZZ_pE& A);<br>
62	62	<font color="#0000ed"><i>// X = transpose of A</i></font><br>
63	63	<br>
64		<font color="#008b00"><b>void</b></font> solve(ZZ_pE& d, vec_ZZ_pE& X,<br>
65		<font color="#008b00"><b>const</b></font> mat_ZZ_pE& A, <font color="#008b00"><b>const</b></font> vec_ZZ_pE& b);<br>
66		<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d =</i></font><br>
67		<font color="#0000ed"><i>// determinant(A).  If d != 0, solves x*A = b.</i></font><br>
	64	<font color="#008b00"><b>void</b></font> solve(ZZ_pE& d, vec_ZZ_pE& x, <font color="#008b00"><b>const</b></font> mat_ZZ_pE& A, <font color="#008b00"><b>const</b></font> vec_ZZ_pE& b);<br>
	65	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
	66	<font color="#0000ed"><i>// If d != 0, solves x*A = b.</i></font><br>
	67	<br>
	68	<font color="#008b00"><b>void</b></font> solve(ZZ_pE& d, <font color="#008b00"><b>const</b></font> mat_ZZ_pE& A, vec_ZZ_pE& x, <font color="#008b00"><b>const</b></font> vec_ZZ_pE& b);<br>
	69	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
	70	<font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
68	71	<br>
69	72	<font color="#008b00"><b>void</b></font> inv(ZZ_pE& d, mat_ZZ_pE& X, <font color="#008b00"><b>const</b></font> mat_ZZ_pE& A);<br>
70	73	<font color="#0000ed"><i>// A is an n x n matrix.  Computes d = determinant(A).  If d != 0,</i></font><br>

-4

doc/mat_ZZ_pE.txt less more

53	53	mat_ZZ_pE transpose(const mat_ZZ_pE& A);
54	54	// X = transpose of A
55	55
56		void solve(ZZ_pE& d, vec_ZZ_pE& X,
57		const mat_ZZ_pE& A, const vec_ZZ_pE& b);
58		// A is an n x n matrix, b is a length n vector. Computes d =
59		// determinant(A). If d != 0, solves x*A = b.
	56	void solve(ZZ_pE& d, vec_ZZ_pE& x, const mat_ZZ_pE& A, const vec_ZZ_pE& b);
	57	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
	58	// If d != 0, solves x*A = b.
	59
	60	void solve(ZZ_pE& d, const mat_ZZ_pE& A, vec_ZZ_pE& x, const vec_ZZ_pE& b);
	61	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
	62	// If d != 0, solves A*x = b (so x and b are treated as column vectors).
60	63
61	64	void inv(ZZ_pE& d, mat_ZZ_pE& X, const mat_ZZ_pE& A);
62	65	// A is an n x n matrix. Computes d = determinant(A). If d != 0,

+72

-22

doc/mat_lzz_p.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_lzz_p.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_lzz_p.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

13	13	<font color="#0000ed"><i>SUMMARY:</i></font><br>
14	14	<br>
15	15	<font color="#0000ed"><i>Defines the class mat_zz_p.</i></font><br>
	16	<font color="#0000ed"><i>Note that the modulus p need not be a prime, except as indicated below.</i></font><br>
	17	<br>
	18	<font color="#0000ed"><i>IMPLEMENTATION NOTES: </i></font><br>
	19	<br>
	20	<font color="#0000ed"><i>Starting with NTL version 9.7.0 (and 9.7.1), many of the routines here have</i></font><br>
	21	<font color="#0000ed"><i>been optimized to take better advantage of specific hardware features available</i></font><br>
	22	<font color="#0000ed"><i>on 64-bit Intel CPU's.  Currently, the mul, inv, determinant, solve, gauss,</i></font><br>
	23	<font color="#0000ed"><i>kernel, and image routines are fastest for p up to 23-bits long (assuming the</i></font><br>
	24	<font color="#0000ed"><i>CPU supports AVX instructions).  After that, performance degrades in three</i></font><br>
	25	<font color="#0000ed"><i>stages: stage 1: up to 28-bits; stage 2: up to 31-bits; stage 3: 32-bits and</i></font><br>
	26	<font color="#0000ed"><i>up. </i></font><br>
	27	<br>
	28	<font color="#0000ed"><i>For primes up to 23-bits, AVX floating point instructions are used.  After</i></font><br>
	29	<font color="#0000ed"><i>that, ordinary integer arithmetic is used.  In a future version, I may exploit</i></font><br>
	30	<font color="#0000ed"><i>AVX2 integer instructions to get better stage 2 performance.  And in the more</i></font><br>
	31	<font color="#0000ed"><i>distant future, AVX512 instructions will be used, when they become available.</i></font><br>
	32	<br>
	33	<font color="#0000ed"><i>On older Intel machines, or non-Intel machines that have "long long" support,</i></font><br>
	34	<font color="#0000ed"><i>one still gets optimizations corresponding to the three stages above.  On</i></font><br>
	35	<font color="#0000ed"><i>32-bit machines, one still gets three stages, just with smaller crossover</i></font><br>
	36	<font color="#0000ed"><i>points.</i></font><br>
16	37	<br>
17	38	<font color="#0000ed"><i>\************************************************************************</i></font><font color="#0000ed"><i>/</i></font><br>
18	39	<br>

47	68	<font color="#0000ed"><i>// X = a * B</i></font><br>
48	69	<br>
49	70	<br>
	71	<font color="#008b00"><b>void</b></font> transpose(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
	72	mat_zz_p transpose(<font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
	73	<font color="#0000ed"><i>// X = transpose of A</i></font><br>
	74	<br>
	75	<br>
50	76	<font color="#008b00"><b>void</b></font> determinant(zz_p& d, <font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
51	77	zz_p determinant(<font color="#008b00"><b>const</b></font> mat_zz_p& a); <br>
52	78	<font color="#0000ed"><i>// d = determinant(A)</i></font><br>
53	79	<br>
54		<br>
55		<font color="#008b00"><b>void</b></font> transpose(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
56		mat_zz_p transpose(<font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
57		<font color="#0000ed"><i>// X = transpose of A</i></font><br>
58		<br>
59		<font color="#008b00"><b>void</b></font> solve(zz_p& d, vec_zz_p& X,<br>
60		<font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>const</b></font> vec_zz_p& b);<br>
61		<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d =</i></font><br>
62		<font color="#0000ed"><i>// determinant(A).  If d != 0, solves x*A = b.</i></font><br>
	80	<font color="#008b00"><b>void</b></font> solve(zz_p& d, vec_zz_p& x, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>const</b></font> vec_zz_p& b);<br>
	81	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
	82	<font color="#0000ed"><i>// If d != 0, solves x*A = b (so x and b are treated as row vectors).</i></font><br>
	83	<br>
	84	<font color="#008b00"><b>void</b></font> solve(zz_p& d, <font color="#008b00"><b>const</b></font> mat_zz_p& A, vec_zz_p& x, <font color="#008b00"><b>const</b></font> vec_zz_p& b);<br>
	85	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
	86	<font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
63	87	<br>
64	88	<font color="#008b00"><b>void</b></font> inv(zz_p& d, mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
65	89	<font color="#0000ed"><i>// A is an n x n matrix.  Computes d = determinant(A).  If d != 0,</i></font><br>
66	90	<font color="#0000ed"><i>// computes X = A^{-1}.</i></font><br>
67	91	<br>
	92	<br>
	93	<font color="#008b00"><b>void</b></font> inv(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
	94	mat_zz_p inv(<font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
	95	<font color="#0000ed"><i>// X = A^{-1}; error is raised if A is  singular</i></font><br>
	96	<br>
	97	<font color="#008b00"><b>void</b></font> power(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>const</b></font> ZZ& e);<br>
	98	mat_zz_p power(<font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>const</b></font> ZZ& e);<br>
	99	<font color="#008b00"><b>void</b></font> power(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>long</b></font> e);<br>
	100	mat_zz_p power(<font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>long</b></font> e);<br>
	101	<font color="#0000ed"><i>// X = A^e; e may be negative (in which case A must be nonsingular).</i></font><br>
	102	<br>
	103	<font color="#0000ed"><i>// NOTE: the routines determinant, solve, inv, and power (with negative</i></font><br>
	104	<font color="#0000ed"><i>// exponent) all require that the modulus p is prime: during elimination, if a</i></font><br>
	105	<font color="#0000ed"><i>// non-zero pivot element does not have an inverse, an error is raised.  The</i></font><br>
	106	<font color="#0000ed"><i>// following "relaxed" versions of these routines will also work with prime</i></font><br>
	107	<font color="#0000ed"><i>// powers, if the optional parameter relax is true (which is the default).</i></font><br>
	108	<font color="#0000ed"><i>// However, note that in these relaxed routines, if a computed determinant</i></font><br>
	109	<font color="#0000ed"><i>// value is zero, this may not be the true determinant: all that you can assume</i></font><br>
	110	<font color="#0000ed"><i>// is that the true determinant is not invertible mod p. If the parameter</i></font><br>
	111	<font color="#0000ed"><i>// relax==false, then these routines behave identically to their "unrelaxed"</i></font><br>
	112	<font color="#0000ed"><i>// counterparts.</i></font><br>
	113	<br>
	114	<font color="#008b00"><b>void</b></font> relaxed_determinant(zz_p& d, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>);<br>
	115	zz_p relaxed_determinant(<font color="#008b00"><b>const</b></font> mat_zz_p& a, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>); <br>
	116	<font color="#008b00"><b>void</b></font> relaxed_solve(zz_p& d, vec_zz_p& x, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>const</b></font> vec_zz_p& b, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>);<br>
	117	<font color="#008b00"><b>void</b></font> relaxed_solve(zz_p& d, <font color="#008b00"><b>const</b></font> mat_zz_p& A, vec_zz_p& x, <font color="#008b00"><b>const</b></font> vec_zz_p& b, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>);<br>
	118	<font color="#008b00"><b>void</b></font> relaxed_inv(zz_p& d, mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>);<br>
	119	<font color="#008b00"><b>void</b></font> relaxed_inv(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>);<br>
	120	mat_zz_p relaxed_inv(<font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>);<br>
	121	<font color="#008b00"><b>void</b></font> relaxed_power(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>const</b></font> ZZ& e, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>);<br>
	122	mat_zz_p relaxed_power(<font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>const</b></font> ZZ& e, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>);<br>
	123	<font color="#008b00"><b>void</b></font> relaxed_power(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>long</b></font> e, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>);<br>
	124	mat_zz_p relaxed_power(<font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>long</b></font> e, <font color="#008b00"><b>bool</b></font> relax=<font color="#cc0000">true</font>);<br>
	125	<br>
	126	<br>
68	127	<font color="#008b00"><b>void</b></font> sqr(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
69	128	mat_zz_p sqr(<font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
70	129	<font color="#0000ed"><i>// X = A*A   </i></font><br>
71		<br>
72		<font color="#008b00"><b>void</b></font> inv(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
73		mat_zz_p inv(<font color="#008b00"><b>const</b></font> mat_zz_p& A);<br>
74		<font color="#0000ed"><i>// X = A^{-1}; error is raised if A is  singular</i></font><br>
75		<br>
76		<font color="#008b00"><b>void</b></font> power(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>const</b></font> ZZ& e);<br>
77		mat_zz_p power(<font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>const</b></font> ZZ& e);<br>
78		<br>
79		<font color="#008b00"><b>void</b></font> power(mat_zz_p& X, <font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>long</b></font> e);<br>
80		mat_zz_p power(<font color="#008b00"><b>const</b></font> mat_zz_p& A, <font color="#008b00"><b>long</b></font> e);<br>
81		<font color="#0000ed"><i>// X = A^e; e may be negative (in which case A must be nonsingular).</i></font><br>
82		<br>
83	130	<br>
84	131	<font color="#008b00"><b>void</b></font> ident(mat_zz_p& X, <font color="#008b00"><b>long</b></font> n);<br>
85	132	mat_zz_p ident_mat_zz_p(<font color="#008b00"><b>long</b></font> n);<br>

112	159	<font color="#0000ed"><i>// Computes a basis for the kernel of the map x -> x*A, where x is a</i></font><br>
113	160	<font color="#0000ed"><i>// row vector.</i></font><br>
114	161	<br>
	162	<font color="#0000ed"><i>// NOTE: the gauss, image, and kernel routines all require that</i></font><br>
	163	<font color="#0000ed"><i>// the modulus p is prime. </i></font><br>
	164	<br>
115	165	<br>
116	166	<br>
117	167	<font color="#0000ed"><i>// miscellaneous:</i></font><br>

+71

-21

doc/mat_lzz_p.txt less more

5	5	SUMMARY:
6	6
7	7	Defines the class mat_zz_p.
	8	Note that the modulus p need not be a prime, except as indicated below.
	9
	10	IMPLEMENTATION NOTES:
	11
	12	Starting with NTL version 9.7.0 (and 9.7.1), many of the routines here have
	13	been optimized to take better advantage of specific hardware features available
	14	on 64-bit Intel CPU's. Currently, the mul, inv, determinant, solve, gauss,
	15	kernel, and image routines are fastest for p up to 23-bits long (assuming the
	16	CPU supports AVX instructions). After that, performance degrades in three
	17	stages: stage 1: up to 28-bits; stage 2: up to 31-bits; stage 3: 32-bits and
	18	up.
	19
	20	For primes up to 23-bits, AVX floating point instructions are used. After
	21	that, ordinary integer arithmetic is used. In a future version, I may exploit
	22	AVX2 integer instructions to get better stage 2 performance. And in the more
	23	distant future, AVX512 instructions will be used, when they become available.
	24
	25	On older Intel machines, or non-Intel machines that have "long long" support,
	26	one still gets optimizations corresponding to the three stages above. On
	27	32-bit machines, one still gets three stages, just with smaller crossover
	28	points.
8	29
9	30	\**************************************************************************/
10	31

39	60	// X = a * B
40	61
41	62
	63	void transpose(mat_zz_p& X, const mat_zz_p& A);
	64	mat_zz_p transpose(const mat_zz_p& A);
	65	// X = transpose of A
	66
	67
42	68	void determinant(zz_p& d, const mat_zz_p& A);
43	69	zz_p determinant(const mat_zz_p& a);
44	70	// d = determinant(A)
45	71
46
47		void transpose(mat_zz_p& X, const mat_zz_p& A);
48		mat_zz_p transpose(const mat_zz_p& A);
49		// X = transpose of A
50
51		void solve(zz_p& d, vec_zz_p& X,
52		const mat_zz_p& A, const vec_zz_p& b);
53		// A is an n x n matrix, b is a length n vector. Computes d =
54		// determinant(A). If d != 0, solves x*A = b.
	72	void solve(zz_p& d, vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b);
	73	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
	74	// If d != 0, solves x*A = b (so x and b are treated as row vectors).
	75
	76	void solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b);
	77	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
	78	// If d != 0, solves A*x = b (so x and b are treated as column vectors).
55	79
56	80	void inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A);
57	81	// A is an n x n matrix. Computes d = determinant(A). If d != 0,
58	82	// computes X = A^{-1}.
59	83
	84
	85	void inv(mat_zz_p& X, const mat_zz_p& A);
	86	mat_zz_p inv(const mat_zz_p& A);
	87	// X = A^{-1}; error is raised if A is singular
	88
	89	void power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e);
	90	mat_zz_p power(const mat_zz_p& A, const ZZ& e);
	91	void power(mat_zz_p& X, const mat_zz_p& A, long e);
	92	mat_zz_p power(const mat_zz_p& A, long e);
	93	// X = A^e; e may be negative (in which case A must be nonsingular).
	94
	95	// NOTE: the routines determinant, solve, inv, and power (with negative
	96	// exponent) all require that the modulus p is prime: during elimination, if a
	97	// non-zero pivot element does not have an inverse, an error is raised. The
	98	// following "relaxed" versions of these routines will also work with prime
	99	// powers, if the optional parameter relax is true (which is the default).
	100	// However, note that in these relaxed routines, if a computed determinant
	101	// value is zero, this may not be the true determinant: all that you can assume
	102	// is that the true determinant is not invertible mod p. If the parameter
	103	// relax==false, then these routines behave identically to their "unrelaxed"
	104	// counterparts.
	105
	106	void relaxed_determinant(zz_p& d, const mat_zz_p& A, bool relax=true);
	107	zz_p relaxed_determinant(const mat_zz_p& a, bool relax=true);
	108	void relaxed_solve(zz_p& d, vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b, bool relax=true);
	109	void relaxed_solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b, bool relax=true);
	110	void relaxed_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax=true);
	111	void relaxed_inv(mat_zz_p& X, const mat_zz_p& A, bool relax=true);
	112	mat_zz_p relaxed_inv(const mat_zz_p& A, bool relax=true);
	113	void relaxed_power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e, bool relax=true);
	114	mat_zz_p relaxed_power(const mat_zz_p& A, const ZZ& e, bool relax=true);
	115	void relaxed_power(mat_zz_p& X, const mat_zz_p& A, long e, bool relax=true);
	116	mat_zz_p relaxed_power(const mat_zz_p& A, long e, bool relax=true);
	117
	118
60	119	void sqr(mat_zz_p& X, const mat_zz_p& A);
61	120	mat_zz_p sqr(const mat_zz_p& A);
62	121	// X = A*A
63
64		void inv(mat_zz_p& X, const mat_zz_p& A);
65		mat_zz_p inv(const mat_zz_p& A);
66		// X = A^{-1}; error is raised if A is singular
67
68		void power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e);
69		mat_zz_p power(const mat_zz_p& A, const ZZ& e);
70
71		void power(mat_zz_p& X, const mat_zz_p& A, long e);
72		mat_zz_p power(const mat_zz_p& A, long e);
73		// X = A^e; e may be negative (in which case A must be nonsingular).
74
75	122
76	123	void ident(mat_zz_p& X, long n);
77	124	mat_zz_p ident_mat_zz_p(long n);

104	151	// Computes a basis for the kernel of the map x -> x*A, where x is a
105	152	// row vector.
106	153
	154	// NOTE: the gauss, image, and kernel routines all require that
	155	// the modulus p is prime.
	156
107	157
108	158
109	159	// miscellaneous:

-3

doc/mat_lzz_pE.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_lzz_pE.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_lzz_pE.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

61	61	mat_zz_pE transpose(<font color="#008b00"><b>const</b></font> mat_zz_pE& A);<br>
62	62	<font color="#0000ed"><i>// X = transpose of A</i></font><br>
63	63	<br>
64		<font color="#008b00"><b>void</b></font> solve(zz_pE& d, vec_zz_pE& X,<br>
65		<font color="#008b00"><b>const</b></font> mat_zz_pE& A, <font color="#008b00"><b>const</b></font> vec_zz_pE& b);<br>
	64	<font color="#008b00"><b>void</b></font> solve(zz_pE& d, vec_zz_pE& x, <font color="#008b00"><b>const</b></font> mat_zz_pE& A, <font color="#008b00"><b>const</b></font> vec_zz_pE& b);<br>
66	65	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d =</i></font><br>
67	66	<font color="#0000ed"><i>// determinant(A).  If d != 0, solves x*A = b.</i></font><br>
	67	<br>
	68	<font color="#008b00"><b>void</b></font> solve(zz_pE& d, <font color="#008b00"><b>const</b></font> mat_zz_pE& A, vec_zz_pE& x, <font color="#008b00"><b>const</b></font> vec_zz_pE& b);<br>
	69	<font color="#0000ed"><i>// A is an n x n matrix, b is a length n vector.  Computes d = determinant(A).</i></font><br>
	70	<font color="#0000ed"><i>// If d != 0, solves A*x = b (so x and b are treated as column vectors).</i></font><br>
68	71	<br>
69	72	<font color="#008b00"><b>void</b></font> inv(zz_pE& d, mat_zz_pE& X, <font color="#008b00"><b>const</b></font> mat_zz_pE& A);<br>
70	73	<font color="#0000ed"><i>// A is an n x n matrix.  Computes d = determinant(A).  If d != 0,</i></font><br>

-2

doc/mat_lzz_pE.txt less more

53	53	mat_zz_pE transpose(const mat_zz_pE& A);
54	54	// X = transpose of A
55	55
56		void solve(zz_pE& d, vec_zz_pE& X,
57		const mat_zz_pE& A, const vec_zz_pE& b);
	56	void solve(zz_pE& d, vec_zz_pE& x, const mat_zz_pE& A, const vec_zz_pE& b);
58	57	// A is an n x n matrix, b is a length n vector. Computes d =
59	58	// determinant(A). If d != 0, solves x*A = b.
	59
	60	void solve(zz_pE& d, const mat_zz_pE& A, vec_zz_pE& x, const vec_zz_pE& b);
	61	// A is an n x n matrix, b is a length n vector. Computes d = determinant(A).
	62	// If d != 0, solves A*x = b (so x and b are treated as column vectors).
60	63
61	64	void inv(zz_pE& d, mat_zz_pE& X, const mat_zz_pE& A);
62	65	// A is an n x n matrix. Computes d = determinant(A). If d != 0,

-1

doc/mat_poly_ZZ.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_poly_ZZ.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_poly_ZZ.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/mat_poly_ZZ_p.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_poly_ZZ_p.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_poly_ZZ_p.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/mat_poly_lzz_p.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/mat_poly_lzz_p.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/mat_poly_lzz_p.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/matrix.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/matrix.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/matrix.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/pair.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/pair.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/pair.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/quad_float.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/quad_float.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/quad_float.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

+14

-1

doc/tools.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/tools.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/tools.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

75	75	<br>
76	76	<font color="#008b00"><b>long</b></font> min(<font color="#008b00"><b>long</b></font> a, <font color="#008b00"><b>int</b></font> b);<br>
77	77	<font color="#008b00"><b>long</b></font> max(<font color="#008b00"><b>long</b></font> a, <font color="#008b00"><b>int</b></font> b);<br>
	78	<br>
	79	<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>int</b></font> min(<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>int</b></font> a, <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>int</b></font> b);<br>
	80	<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>int</b></font> max(<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>int</b></font> a, <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>int</b></font> b);<br>
	81	<br>
	82	<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> min(<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> a, <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> b);<br>
	83	<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> max(<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> a, <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> b);<br>
	84	<br>
	85	<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> min(<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>int</b></font> a, <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> b);<br>
	86	<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> max(<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>int</b></font> a, <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> b);<br>
	87	<br>
	88	<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> min(<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> a, <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>int</b></font> b);<br>
	89	<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> max(<font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>long</b></font> a, <font color="#008b00"><b>unsigned</b></font> <font color="#008b00"><b>int</b></font> b);<br>
	90	<br>
78	91	<br>
79	92	<font color="#008b00"><b>void</b></font> swap(<font color="#008b00"><b>long</b></font>& a, <font color="#008b00"><b>long</b></font>& b);<br>
80	93	<font color="#008b00"><b>void</b></font> swap(<font color="#008b00"><b>int</b></font>& a, <font color="#008b00"><b>int</b></font>& b);<br>

+13

-0

doc/tools.txt less more

67	67
68	68	long min(long a, int b);
69	69	long max(long a, int b);
	70
	71	unsigned int min(unsigned int a, unsigned int b);
	72	unsigned int max(unsigned int a, unsigned int b);
	73
	74	unsigned long min(unsigned long a, unsigned long b);
	75	unsigned long max(unsigned long a, unsigned long b);
	76
	77	unsigned long min(unsigned int a, unsigned long b);
	78	unsigned long max(unsigned int a, unsigned long b);
	79
	80	unsigned long min(unsigned long a, unsigned int b);
	81	unsigned long max(unsigned long a, unsigned int b);
	82
70	83
71	84	void swap(long& a, long& b);
72	85	void swap(int& a, int& b);

+455

-2

doc/tour-changes.html less more

14	14	A Tour of NTL: Summary of Changes
15	15	</p>
16	16	</h1>
	17
	18
	19	<p><hr><p>
	20	<h3>
	21	2016.05.30: Changes between NTL 9.8.1 and 9.9.0
	22	</h3>
	23
	24	<ul>
	25	<li>
	26	Added
	27	examples and documentation on how to use NTL's thread pools
	28	and parallel for loops:
	29	<a href="tour-ex7.html">see here</a>
	30	<li>
	31	The build procedure now puts files <tt>config_log.h</tt>
	32	and <tt>wizard_log.h</tt>
	33	in NTL's include directory.
	34	These files contain comments that document what choices were
	35	made during the build process,
	36	including the <tt>CXXAUTOFLAGS</tt> value.
	37	<li>
	38	Added <tt>elts()</tt> method to <tt>UniqueArray</tt> and <tt>AlignedArray</tt>
	39	(for compatibility with <tt>Vec</tt> class)
	40
	41	<li>
	42	Added <tt>get()</tt> and <tt>release()</tt> methods to <tt>OptionalVal</tt>
	43
	44	<li>
	45	Made constructors for <tt>PartitionInfo</tt> and <tt>BasicThreadPool</tt>
	46	<i>explicit</i>
	47
	48	<li>
	49	Cleaned up some pointer issues in <tt>mat_lzz_p.c</tt> (mainly academic)
	50
	51	<li>
	52	Definition of <tt>NTL_TLS_LOCAL_INIT</tt> ensures that var names
	53	a local reference, regardless of the implementation
	54
	55	<li>
	56	Allow <tt>p.move(q)</tt>, where <tt>p</tt> is a <tt>UniquePtr<T></tt>,
	57	<tt>q</tt> is a <tt>UniquePtr<Y></tt>,
	58	and <tt>Y</tt> converts to <tt>T</tt>.
	59
	60	<li>
	61	Introduced <tt>PreconditionedRemainder</tt> class
	62	for faster reduction of a <tt>ZZ</tt> modulo a fixed long.
	63	This is intended to make Chinese Remaindering type computations faster
	64	<ul>
	65	<li>
	66	for the time being,
	67	this is an undocumented feature which may be modified or removed
	68	in a future release
	69	</ul>
	70
	71	<li>
	72	Introduced <tt>ll_type</tt> and related routines which perform
	73	a restricted set of operations on a long-long-like type.
	74	It can be implemented via inline asm, and is a cleaner
	75	interface and sometimes faster.
	76	On x86-64/gcc platforms, the assembly code version is
	77	used and gives a modest speed boost.
	78	For all other platforms (including x86-64 with clang or icc),
	79	the assembly code is not used.
	80	I should really dynamically enable the assembly via the performance
	81	tuning wizard, but I don't do this yet.
	82	To explicitly disable the assembly code,
	83	configure with <tt>NTL_DISABLE_LL_ASM=on</tt>.
	84	<ul>
	85	<li>
	86	for the time being,
	87	this is an undocumented feature which may be modified or removed
	88	in a future release
	89	</ul>
	90
	91	</ul>
	92
	93
	94	<p><hr><p>
	95	<h3>
	96	2016.04.29: Changes between NTL 9.8.0 and 9.8.1
	97	</h3>
	98
	99	<ul>
	100	<li>
	101	Fixed an annoying issue that could cause unnecessary
	102	ambiguities in client code when compiling with <tt>NTL_EXCEPTIONS=on</tt>
	103	</ul>
	104
	105
	106	<p><hr><p>
	107	<h3>
	108	2016.04.26: Changes between NTL 9.7.1 and 9.8.0
	109	</h3>
	110
	111	<ul>
	112	<p> <li>
	113	<b>Thread safety for the masses!</b>
	114
	115	<ul>
	116	<li>
	117	Previous versions of NTL required full <tt>C++11</tt>
	118	compliance to achieve thread safety
	119	<li>
	120	Unfortunately, many platforms (notably, Mac OSX)
	121	do not provide the necessary
	122	features - in particular, they do not provide full, correct support
	123	for "thread local storage" (TLS)
	124	<li>
	125	This new release (by default) will apply a "TLS hack"
	126	that works around this limitation (at least for
	127	gcc and gcc-compatible compilers such as clang and icc)
	128	<ul>
	129	<li>
	130	With this "hack", it is only required that gcc's
	131	more widely available <tt>__thread</tt>
	132	storage specifier be implemented, rather than the less widely available
	133	<tt>thread_local</tt> specifier (and it also makes direct use
	134	of the <tt>pthread</tt> library)
	135	<li>
	136	You can explicitly disable the hack by configuring NTL
	137	with <tt>NTL_DISABLE_TLS_HACK=on</tt>
	138	</ul>
	139	<li>
	140	This "hack" has been successfully
	141	tested on Linux with gcc 4.8.5
	142	and on Mac OSX 10.10 and 10.11 with clang
	143	<ul>
	144	<li>
	145	It should work with any gcc 4.8.x or higher
	146	<li>
	147	Many thanks to Justin Walker for pushing this issue and
	148	helping with the Mac OSX testing
	149	</ul>
	150	</ul>
	151
	152	<li><p>
	153	Fixed a "pseudo" bug in the test script: <tt>BitMatTest</tt>
	154	in <tt> make check</tt> was reporting "failure", but this was
	155	a bug in <tt>BitMatTest</tt>, not in NTL itself.
	156
	157	<li>
	158	Fixed a real bug in the <tt>ReleaseThreadPool</tt>
	159	function (although NTL internally does not use this function,
	160	so only client code that called it directly would be affected).
	161
	162
	163	</ul>
	164
	165	<p><hr><p>
	166	<h3>
	167	2016.04.20: Changes between NTL 9.7.0 and 9.7.1
	168	</h3>
	169
	170	<ul>
	171
	172	<li>
	173	Extended the performance improvements in
	174	<a href="mat_lzz_p.cpp.html">mat_lzz_p</a>
	175	to include the <tt>gauss</tt>, <tt>kernel</tt>,
	176	and <tt>image</tt> routines
	177
	178	<li>
	179	Generally improved
	180	performance for all of the <a href="mat_lzz_p.cpp.html">mat_lzz_p</a> routines,
	181	including an implementation of Strassen for matrix multiplication.
	182
	183	<li>
	184	Added the matrix/column vector <tt>solve</tt> routines
	185	to all other matrix classes (for consistency).
	186
	187	<p>
	188	<li>
	189	Fixed a compile-time bug that occurred on certain platforms
	190	(mainly Windows).
	191
	192	<li>
	193	Made some of the steps in <tt>configure</tt> and <tt>make</tt>
	194	a bit more quiet (look at <tt>.log</tt> files for outputs).
	195
	196	</ul>
	197
	198
	199	<p><hr><p>
	200	<h3>
	201	2016.03.12: Changes between NTL 9.6.4 and 9.7.0
	202	</h3>
	203
	204	<ul>
	205
	206
	207
	208	<p>
	209	<li>
	210	Changes to <a href="mat_lzz_p.cpp.html">mat_lzz_p</a> module:
	211	<ul>
	212	<li>
	213	Improved performance of <tt>mul</tt>, <tt>inv</tt>, <tt>solve</tt>,
	214	and <tt>determinant</tt> routines:
	215	<ul>
	216	<li>
	217	more cache friendly
	218	<li>
	219	thread boosted
	220	<li>
	221	for small p (up to 23 bits), exploits
	222	AVX and FMA instructions (when available)
	223	<li>
	224	depending on many things,
	225	the new code can be anywhere between
	226	1.5x and 70x (!) times faster than the old code
	227	(part of that speedup can be attributed to just how
	228	awful some of the old code was, rather than
	229	how brilliant the new code is)
	230	<li>
	231	on the SandyBridge and Haswell machines I was able to test,
	232	the new code is comparable in speed
	233	to
	234	<a href="https://linbox-team.github.io/fflas-ffpack/">FFLAS/FFPACK</a>
	235	</ul>
	236	<li>
	237	Added "relaxed" versions of <tt>inv</tt>, <tt>solve</tt>, and
	238	<tt>determinant</tt>,
	239	which also now work for prime powers, not just primes
	240	<li>
	241	Added a new variant of <tt>solve</tt> routine to solve <tt>A*x = b</tt>
	242	for column vectors
	243	</ul>
	244
	245	<p>
	246	<li>Changes to <a href="BasicThreadPool.cpp.html">BasicThreadPool</a>
	247	module:
	248	<ul>
	249	<li>
	250	Added <tt>NTL_EXEC_RANGE</tt> and other functionality which makes writing
	251	"parallel for loops" simple (very similar to OpenMP),
	252	and the same source code will work regardless of whether
	253	threads or thread boosting is enabled.
	254
	255	<li>
	256	Backward incompatibilities:
	257	<ul>
	258	<li>
	259	<tt>NTLThreadPool</tt> is no longer directly accessible:
	260	new access functions are provided
	261	<li>
	262	Got rid of method <tt>SplitProblems</tt>, and made a more general/abstract
	263	class <tt>PartitionInfo</tt>
	264	</ul>
	265	</ul>
	266
	267
	268	<p>
	269	<li>
	270	Miscellaneous:
	271	<ul>
	272	<li>
	273	Improved crossover points for <tt>GF2X</tt> division
	274
	275	<li>
	276	Made access to thread local variables used in NTL faster
	277	by using GCC's <tt>__thread</tt> in place of <tt>thread_local</tt>,
	278	wherever possible
	279
	280	<li>
	281	Improved performance of <tt>vec_long</tt> to <tt>vec_zz_p</tt> conversion
	282
	283	<li>
	284	Made AVX and FMA detection more robust, requiring LP64
	285
	286	<li>
	287	Added <tt>InvModStatus</tt> for <tt>long</tt>'s
	288
	289	<li>
	290	Bumped <tt>FILE_THRESH</tt> to 1e12
	291	</ul>
	292	</ul>
	293
	294	<p><hr><p>
	295	<h3>
	296	2016.01.30: Changes between NTL 9.6.3 and 9.6.4
	297	</h3>
	298
	299	<ul>
	300	<li>
	301	Streamlined some of the installation scripts,
	302	so now the "heuristic selection of compiler flags"
	303	and the "nonstandard feature testing" procedures are more structured
	304	so as to be easier to extend in the future -- it is beginning to
	305	act more like a sort of "autoconf".
	306	<li>
	307	Fixed a couple of "buglets" in the header files.
	308	</ul>
	309
	310
	311	<p><hr><p>
	312	<h3>
	313	2016.01.26: Changes between NTL 9.6.2 and 9.6.3
	314	</h3>
	315
	316	<ul>
	317	<li>
	318	Some changes to the installation procedure:
	319	<ul>
	320	<li>
	321	For the Unix distribution, <tt>NTL_GMP_LIP</tt> is now
	322	<i>on</i> by default, which means that by default, NTL will use
	323	GMP.
	324	<li>
	325	By default, the configuration script will attempt a
	326	"native" build by passing <tt>-march=native</tt>
	327	as a compiler flag.
	328	Most modern compilers support this, but the configuration script will
	329	check to make sure.
	330	<li>
	331	The <tt>NTL_PCLMUL</tt> flag (which enables the use of
	332	Intel's PCLMUL instruction) is now automagically set by the
	333	Wizard script.
	334	<li>
	335	The build script automatically checks for availability of Intel
	336	<tt>AVX</tt> intrinsics, which may be used to better
	337	optimize certain code.
	338	</ul>
	339	<li>
	340	A new modular composition implementation for <tt>zz_pX</tt>.
	341	This makes modular composition up to 3x faster, depending
	342	on several factors.
	343	<a href="lzz_pX.cpp.html#compmod">See here</a> for details.
	344
	345	<li>
	346	Improved performance for polynomial factoring over <tt>zz_pX</tt>
	347	using <tt>CanZass</tt>,
	348	using the improved modular composition routine (above)
	349	and better choice of baby step / giant step parameters.
	350	This leads to a 1.1x to 1.8x speedup, depending on several factors.
	351
	352	<li>
	353	Improved robustness of <tt>quad_float</tt> implementation:
	354	it should now work correctly on platforms that are too
	355	liberal in their use of FMA instructions.
	356
	357
	358	</ul>
	359
	360	<p><hr><p>
	361	<h3>
	362	2015.11.13: Changes between NTL 9.6.1 and 9.6.2
	363	</h3>
	364
	365	<ul>
	366	<li>
	367	More small tweaks and a new configuration variable:
	368	<pre>
	369	NTL_MAXIMIZE_SP_NBITS=off
	370
	371	# Allows for 62-bit single-precision moduli on 64-bit platforms.
	372	# By default, such moduli are restricted to 60 bits, which
	373	# usually gives slightly better performance across a range of
	374	# parameters.
	375	</pre>
	376
	377	</ul>
	378
	379	<p><hr><p>
	380	<h3>
	381	2015.11.13: Changes between NTL 9.6.0 and 9.6.1
	382	</h3>
	383
	384	<ul>
	385	<li>
	386	Streamlined some awkward code in <tt>g_lip_impl.h</tt>.
	387	<li>
	388	Made <tt>QuickTest</tt> a bit quicker.
	389	<li>
	390	Fixed some documentation/packaging problems.
	391	</ul>
	392
	393	<p><hr><p>
	394	<h3>
	395	2015.11.10: Changes between NTL 9.5.0 and 9.6.0
	396	</h3>
	397
	398	<ul>
	399	<li>
	400	More performance tuning for <tt>ZZ_pX</tt> arithmetic.
	401
	402	<li>
	403	Added configuration variable <tt>CXXAUTOFLAGS</tt>,
	404	which is dynamically (and heuristically) set by the configuration
	405	script.
	406	This way, <tt>CXXFLAGS</tt> is not modified by the script.
	407	</ul>
	408
	409
	410	<p><hr><p>
	411	<h3>
	412	2015.10.20: Changes between NTL 9.4.0 and 9.5.0
	413	</h3>
	414
	415	<ul>
	416	<li>
	417	Added a new <i>thread boosting</i> feature.
	418	With this feature, certain code within NTL will use available
	419	threads to speed up certain computations on a multicore
	420	machine.
	421	This feature is enabled by setting <tt>NTL_THREAD_BOOST=on</tt>
	422	during configuration.
	423	See <a href="BasicThreadPool.cpp.html">BasicThreadPool.txt</a>
	424	for more information.
	425
	426	<p>
	427	This feature is a work in progress.
	428	Currently, basic <tt>ZZ_pX</tt> arithmetic has been thread boosted.
	429	More code will be boosted later.
	430
	431	<li>
	432	A bit more performance tuning for <tt>ZZ_pX</tt> arithmetic,
	433	and better crossovers for <tt>ZZX</tt> multiplication.
	434
	435	</ul>
	436
	437	<p><hr><p>
	438	<h3>
	439	2015.9.22: Changes between NTL 9.3.0 and 9.4.0
	440	</h3>
	441
	442	<ul>
	443	<li>
	444	Performance tuning: <tt>ZZ_pX</tt> and <tt>zz_pX</tt> keep
	445	getting faster
	446
	447	<li>
	448	Upgrade to pseudo-random number generation:
	449	I replaced the underlying PRG with Chacha20 (replacing RC4)
	450	and the underlying key-derivation function with a function
	451	based on HMAC-SHA256 (replacing an MD5-based function).
	452	The new routines are faster and more secure.
	453	<p>
	454	I also expanded the PRG interface a bit:
	455	<a href="ZZ.cpp.html#prg">see here</a> for details.
	456
	457	<li>
	458	Bug fixes: fixed a (mostly dormant) bug in the <tt>ZZFromBytes</tt>
	459	routine (triggered only when <tt>n==0</tt>).
	460
	461	<li>
	462	Added documentation for classes <tt>RRPush</tt> and
	463	<tt>RROutputPush</tt>:
	464	<a href="RR.cpp.html#push">see here</a> for details.
	465
	466	</ul>
	467
	468
	469
17	470
18	471	<p><hr><p>
19	472	<h3>

61	514	This might change in the future.
62	515
63	516	<p>
64		For details, look <a href="ZZ.cpp.html#modarith">here</a>,
	517	For details, see <a href="ZZ.cpp.html#modarith">here</a>,
65	518	including the comments entitled "Compatibility notes".
66	519
67	520	<p>

231	684	interface (although this is not recommended practice).
232	685
233	686	<p>
234		For details, look <a href="ZZ.cpp.html#modarith">here</a>,
	687	For details, <a href="ZZ.cpp.html#modarith">see here</a>,
235	688	including the comments entitled "Compatibility notes".
236	689
237	690	<p>

+20

-20

doc/tour-ex5.html less more

25	25	#include <NTL/ZZ_pXFactoring.h>
26	26	#include <NTL/ZZ_pEX.h>
27	27
28		NTL_CLIENT
	28	using namespace std;
	29	using namespace NTL;
29	30
30	31	int main()
31	32	{

53	54	}
54	55	ENDPLAIN -->
55	56	<!-- STARTPRETTY {{{ -->
56		<p><p><table cellPadding=10px><tr><td><font color="#000000">
57		<font face="monospace">
58		<font color="#1874cd">#include </font><font color="#4a708b"><NTL/ZZ_pXFactoring.h></font><br>
59		<font color="#1874cd">#include </font><font color="#4a708b"><NTL/ZZ_pEX.h></font><br>
	57	<p><p><table cellPadding=10px><tr><td><font color="#000000"><font face="monospace">
	58	<font color="#1773cc">#include </font><font color="#4a6f8b"><NTL/ZZ_pXFactoring.h></font><br>
	59	<font color="#1773cc">#include </font><font color="#4a6f8b"><NTL/ZZ_pEX.h></font><br>
60	60	<br>
61		NTL_CLIENT<br>
	61	<font color="#b02f60"><b>using</b></font> <font color="#008b00"><b>namespace</b></font> std;<br>
	62	<font color="#b02f60"><b>using</b></font> <font color="#008b00"><b>namespace</b></font> NTL;<br>
62	63	<br>
63	64	<font color="#008b00"><b>int</b></font> main()<br>
64	65	{<br>
65		ZZ_p::init(ZZ(<font color="#ff8c00">17</font>)); <font color="#0000ee"><i>// define GF(17)</i></font><br>
	66	ZZ_p::init(ZZ(<font color="#ff8b00">17</font>)); <font color="#0000ed"><i>// define GF(17)</i></font><br>
66	67	<br>
67	68	ZZ_pX P;<br>
68		BuildIrred(P, <font color="#ff8c00">10</font>); <font color="#0000ee"><i>// generate an irreducible polynomial P</i></font><br>
69		<font color="#0000ee"><i>// of degree 10 over GF(17)</i></font><br>
	69	BuildIrred(P, <font color="#ff8b00">10</font>); <font color="#0000ed"><i>// generate an irreducible polynomial P</i></font><br>
	70	<font color="#0000ed"><i>// of degree 10 over GF(17)</i></font><br>
70	71	<br>
71		ZZ_pE::init(P); <font color="#0000ee"><i>// define GF(17^10)</i></font><br>
	72	ZZ_pE::init(P); <font color="#0000ed"><i>// define GF(17^10)</i></font><br>
72	73	<br>
73		ZZ_pEX f, g, h;  <font color="#0000ee"><i>// declare polynomials over GF(17^10)</i></font><br>
	74	ZZ_pEX f, g, h;  <font color="#0000ed"><i>// declare polynomials over GF(17^10)</i></font><br>
74	75	<br>
75		random(f, <font color="#ff8c00">20</font>);  <font color="#0000ee"><i>// f is a random, monic polynomial of degree 20</i></font><br>
76		SetCoeff(f, <font color="#ff8c00">20</font>);<br>
	76	random(f, <font color="#ff8b00">20</font>);  <font color="#0000ed"><i>// f is a random, monic polynomial of degree 20</i></font><br>
	77	SetCoeff(f, <font color="#ff8b00">20</font>);<br>
77	78	<br>
78		random(h, <font color="#ff8c00">20</font>); <font color="#0000ee"><i>// h is a random polynomial of degree less than 20</i></font><br>
	79	random(h, <font color="#ff8b00">20</font>); <font color="#0000ed"><i>// h is a random polynomial of degree less than 20</i></font><br>
79	80	<br>
80		g = MinPolyMod(h, f); <font color="#0000ee"><i>// compute the minimum polynomial of h modulo f</i></font><br>
	81	g = MinPolyMod(h, f); <font color="#0000ed"><i>// compute the minimum polynomial of h modulo f</i></font><br>
81	82	<br>
82		<font color="#b03060"><b>if</b></font> (g == <font color="#ff8c00">0</font>) Error(<font color="#4a708b">"oops (1)"</font>); <font color="#0000ee"><i>// check that g != 0</i></font><br>
	83	<font color="#b02f60"><b>if</b></font> (g == <font color="#ff8b00">0</font>) Error(<font color="#4a6f8b">"oops (1)"</font>); <font color="#0000ed"><i>// check that g != 0</i></font><br>
83	84	<br>
84		<font color="#b03060"><b>if</b></font> (CompMod(g, h, f) != <font color="#ff8c00">0</font>) <font color="#0000ee"><i>// check that g(h) = 0 mod f</i></font><br>
85		Error(<font color="#4a708b">"oops (2)"</font>);<br>
	85	<font color="#b02f60"><b>if</b></font> (CompMod(g, h, f) != <font color="#ff8b00">0</font>) <font color="#0000ed"><i>// check that g(h) = 0 mod f</i></font><br>
	86	Error(<font color="#4a6f8b">"oops (2)"</font>);<br>
86	87	}<br>
87		</font>
88		</font></td></tr></table><p><p>
	88	</font></font></td></tr></table><p><p>
89	89	<!-- }}} ENDPRETTY -->
90	90
91	91

-2

doc/tour-ex6.html less more

6	6	<center>
7	7	<a href="tour-ex5.html"><img src="arrow1.gif" alt="[Previous]" align=bottom></a>
8	8	<a href="tour-examples.html"><img src="arrow2.gif" alt="[Up]" align=bottom></a>
9		<img src="arrow3.gif" alt="[Next]" align=bottom>
	9	<a href="tour-ex7.html"><img src="arrow3.gif" alt="[Next]" align=bottom></a>
10	10	</center>
11	11
12	12	<h1>

34	34	<!-- STARTPLAIN
35	35	#include <NTL/RR.h>
36	36
	37	using namespace std;
	38	using namespace NTL;
	39
37	40	int main()
38	41	{
39	42	RR acc, val;

48	51	<!-- STARTPRETTY {{{ -->
49	52	<p><p><table cellPadding=10px><tr><td><font color="#000000"><font face="monospace">
50	53	<font color="#1773cc">#include </font><font color="#4a6f8b"><NTL/RR.h></font><br>
	54	<br>
	55	<font color="#b02f60"><b>using</b></font> <font color="#008b00"><b>namespace</b></font> std;<br>
	56	<font color="#b02f60"><b>using</b></font> <font color="#008b00"><b>namespace</b></font> NTL;<br>
51	57	<br>
52	58	<font color="#008b00"><b>int</b></font> main()<br>
53	59	{<br>

123	129	<center>
124	130	<a href="tour-ex5.html"><img src="arrow1.gif" alt="[Previous]" align=bottom></a>
125	131	<a href="tour-examples.html"><img src="arrow2.gif" alt="[Up]" align=bottom></a>
126		<img src="arrow3.gif" alt="[Next]" align=bottom>
	132	<a href="tour-ex7.html"><img src="arrow3.gif" alt="[Next]" align=bottom></a>
127	133	</center>
128	134
129	135	</body>

+200

-0

doc/tour-ex7.html less more

	0	<html>
	1	<head>
	2	<title>
	3	A Tour of NTL: Examples: Thread Pools</title>
	4	</head>
	5
	6	<center>
	7	<a href="tour-ex6.html"><img src="arrow1.gif" alt="[Previous]" align=bottom></a>
	8	<a href="tour-examples.html"><img src="arrow2.gif" alt="[Up]" align=bottom></a>
	9	<img src="arrow3.gif" alt="[Next]" align=bottom>
	10	</center>
	11
	12	<h1>
	13	<p align=center>
	14	A Tour of NTL: Examples: Thread Pools
	15	</p>
	16	</h1>
	17
	18	<p> <hr> <p>
	19
	20	If you have built NTL with <tt>NTL_THREAD_BOOST=on</tt>,
	21	then not only is NTL thread safe, but certain parts
	22	of NTL are designed to use multiple threads to speed things
	23	up.
	24	To implement this, NTL makes use of a <i>thread pool</i>,
	25	which is a collection of threads that are created once
	26	and then used over and over again, to avoid the significant
	27	overhead of thread creation and destruction.
	28	You can also use this same thread pool to speed up
	29	NTL client code.
	30	<p>
	31	To use this feature, you have to include the header file
	32	<tt>NTL/BasicThreadPool.h</tt>.
	33	In your main program, you should also indicate how many threads
	34	you want in the pool.
	35	If you want, say, 8 threads, you do this by calling the function
	36	<tt>SetNumThreads(8)</tt>.
	37	<p>
	38	If you do this, then certain parts of NTL will use these
	39	threads when possible (this is a work in progress).
	40	To use these threads in your own code, the easiest way
	41	to do this is with a <i>parallel for loop</i>,
	42	illustrated in the following example.
	43
	44	See <a href="BasicThreadPool.cpp.html"><tt>BasicThreadPool.txt</tt></a>
	45	for more details.
	46
	47	Consider the following routine:
	48
	49
	50	<!-- STARTPLAIN
	51	void mul(ZZ *x, const ZZ *a, const ZZ *b, long n)
	52	{
	53	for (long i = 0; i < n; i++)
	54	mul(x[i], a[i], b[i]);
	55	}
	56	ENDPLAIN -->
	57	<!-- STARTPRETTY {{{ -->
	58	<p><p><table cellPadding=10px><tr><td><font color="#000000"><font face="monospace">
	59	<font color="#008b00"><b>void</b></font> mul(ZZ *x, <font color="#008b00"><b>const</b></font> ZZ *a, <font color="#008b00"><b>const</b></font> ZZ *b, <font color="#008b00"><b>long</b></font> n) <br>
	60	{<br>
	61	<font color="#b02f60"><b>for</b></font> (<font color="#008b00"><b>long</b></font> i = <font color="#ff8b00">0</font>; i < n; i++)<br>
	62	mul(x[i], a[i], b[i]);<br>
	63	}<br>
	64	</font></font></td></tr></table><p><p>
	65	<!-- }}} ENDPRETTY -->
	66
	67
	68
	69
	70
	71
	72	<p>
	73	We can parallelize it as follows:
	74
	75	<!-- STARTPLAIN
	76	void mul(ZZ *x, const ZZ *a, const ZZ *b, long n)
	77	{
	78	NTL_EXEC_RANGE(n, first, last)
	79
	80	for (long i = first; i < last; i++)
	81	mul(x[i], a[i], b[i]);
	82
	83	NTL_EXEC_RANGE_END
	84	}
	85	ENDPLAIN -->
	86	<!-- STARTPRETTY {{{ -->
	87	<p><p><table cellPadding=10px><tr><td><font color="#000000"><font face="monospace">
	88	<font color="#008b00"><b>void</b></font> mul(ZZ *x, <font color="#008b00"><b>const</b></font> ZZ *a, <font color="#008b00"><b>const</b></font> ZZ *b, <font color="#008b00"><b>long</b></font> n) <br>
	89	{<br>
	90	NTL_EXEC_RANGE(n, first, last) <br>
	91	<br>
	92	<font color="#b02f60"><b>for</b></font> (<font color="#008b00"><b>long</b></font> i = first; i < last; i++)<br>
	93	mul(x[i], a[i], b[i]);<br>
	94	<br>
	95	NTL_EXEC_RANGE_END<br>
	96	}<br>
	97	</font></font></td></tr></table><p><p>
	98	<!-- }}} ENDPRETTY -->
	99
	100
	101
	102
	103	<p>
	104	<tt>NTL_EXEC_RANGE</tt> and
	105	<tt>NTL_EXEC_RANGE_END</tt> are macros that just <i>do the right
	106	thing</i>. If there are <i>nt</i> threads available, the interval
	107	[0..<i>n</i>) will be
	108	partitioned into (up to) <i>nt</i> subintervals, and a different thread will be
	109	used to process each subinterval. You still have to write the for loop
	110	yourself: the macro just declares and initializes variables <i>first</i> and
	111	<i>last</i> (or whatever you want to call them) of type <tt>long</tt>
	112	that represent the
	113	subinterval [<i>first</i>..<i>last</i>) to be processed by one thread.
	114
	115
	116
	117	<p>
	118	Note that the current thread participates as one of the <i>nt</i> available
	119	threads, and that the current thread will wait for all participating threads
	120	to finish their task before proceeding.
	121
	122	<p>
	123	Within the "body" of this construct, you can freely reference any variables
	124	that are visible at this point. This is implemented using the C++ lambda
	125	feature (capturing all variables by reference).
	126
	127	<p>
	128	This construct will still work even if threads are disabled, in which case
	129	it runs single-threaded with <i>first=0</i> and <i>last=n</i>.
	130
	131	<p>
	132	Note that the code within the <tt>EXEC_RANGE</tt>
	133	body could call other routines that
	134	themselves attempt to execute an <tt>EXEC_RANGE</tt>:
	135	if this happens, the latter
	136	<tt>EXEC_RANGE</tt> will detect this and run single-threaded.
	137
	138	<p>
	139	You may wish to do other things within the <tt>EXEC_RANGE</tt>
	140	body than just execute
	141	a loop. One thing you may want to do is to declare variables. Another
	142	thing you may want to do is setup a local context
	143	for a <tt>ZZ_p</tt> modulus (or
	144	other type of modulus).
	145	Here is an example of doing this:
	146
	147
	148	<!-- STARTPLAIN
	149	void mul(ZZ_p *x, const ZZ_p *a, const ZZ_p *b, long n)
	150	{
	151	ZZ_pContext context;
	152	context.save();
	153
	154	NTL_EXEC_RANGE(n, first, last)
	155
	156	context.restore();
	157
	158	for (long i = first; i < last; i++)
	159	mul(x[i], a[i], b[i]);
	160
	161	NTL_EXEC_RANGE_END
	162	}
	163	ENDPLAIN -->
	164	<!-- STARTPRETTY {{{ -->
	165	<p><p><table cellPadding=10px><tr><td><font color="#000000"><font face="monospace">
	166	<font color="#008b00"><b>void</b></font> mul(ZZ_p *x, <font color="#008b00"><b>const</b></font> ZZ_p *a, <font color="#008b00"><b>const</b></font> ZZ_p *b, <font color="#008b00"><b>long</b></font> n) <br>
	167	{<br>
	168	ZZ_pContext context;<br>
	169	context.save();<br>
	170	<br>
	171	NTL_EXEC_RANGE(n, first, last) <br>
	172	<br>
	173	context.restore();<br>
	174	<br>
	175	<font color="#b02f60"><b>for</b></font> (<font color="#008b00"><b>long</b></font> i = first; i < last; i++)<br>
	176	mul(x[i], a[i], b[i]);<br>
	177	<br>
	178	NTL_EXEC_RANGE_END<br>
	179	}<br>
	180	</font></font></td></tr></table><p><p>
	181	<!-- }}} ENDPRETTY -->
	182
	183
	184
	185
	186	<p>
	187	A lower-level set of tools is available, which allow for
	188	more fine-grained control.
	189	See <a href="BasicThreadPool.cpp.html">BasicThreadPool.txt</a>
	190	for more details.
	191
	192	<center>
	193	<a href="tour-ex6.html"><img src="arrow1.gif" alt="[Previous]" align=bottom></a>
	194	<a href="tour-examples.html"><img src="arrow2.gif" alt="[Up]" align=bottom></a>
	195	<img src="arrow3.gif" alt="[Next]" align=bottom>
	196	</center>
	197
	198	</body>
	199	</html>

-0

doc/tour-examples.html less more

33	33	<li> <a href="tour-ex4.html">Modular Arithmetic</a>
34	34	<li> <a href="tour-ex5.html">Extension Rings and Fields</a>
35	35	<li> <a href="tour-ex6.html">Floating Point Classes</a>
	36	<li> <a href="tour-ex7.html">Thread Pools</a>
36	37
37	38
38	39	</ol>

+21

-22

doc/tour-gmp.html less more

33	33	The speedup is most dramatic on x86 machines.
34	34
35	35	<p>
36		By default, NTL uses a long integer package derived
	36	As of version 9.6.3, NTL uses GMP by default.
	37	You can disable GMP by passing <tt>NTL_GMP_LIP=off</tt>
	38	as an option to NTL's <tt>configure</tt> script.
	39	If you disable the use of GMP,
	40	NTL uses a long integer package derived
37	41	from Arjen Lenstra's LIP package.
38		However, for extra speed, it is recommended to use GMP.
39		Building NTL with GMP
40		takes a few extra minutes work,
41		and you certainly do not need to use NTL with GMP if you don't want to.
42		As far as I know, GMP is only available on Unix systems
43		and on Windows systems using Cygwin tools.
	42	This is not recommended: GMP is much faster.
44	43
45	44	<p>
46	45	Even if you do not use GMP,

51	50	<p>
52	51	<b>Note:</b> GMP is thread safe, so you should feel free to use it
53	52	in a thread-safe build of NTL.
54		However, the current version (v6)
	53	However, the current version of GMP (v6.1)
55	54	is <i>not</i> entirely exception friendly (it may
56	55	abort a running program, but only in some very extreme and
57	56	unusual circumstances).

62	61	<h2>
63	62	Downloading and building GMP
64	63	</h2>
	64	<p>
	65
	66	Many unix distributions now include GMP by default.
	67	But if not, it is pretty easy to install it directly from source,
	68	as follows.
65	69	<p>
66	70
67	71	Download GMP from <a href="http://gmplib.org">here.</a>

103	107	</h2>
104	108	<p>
105	109
106		When building NTL with GMP, you have to tell NTL that you want to
107		use GMP as the long integer package,
108		and where the include files and library are.
109		The easiest way to do this is by passing the argument
110		<tt>NTL_GMP_LIP=on</tt> to the NTL configuration script
111		when you are <a href="tour-unix.html">installing NTL</a>.
112		Assuming you installed GMP in <tt>$HOME/sw</tt> as above,
	110	When you are <a href="tour-unix.html">installing NTL</a>,
	111	if you installed GMP in <tt>$HOME/sw</tt> as above,
113	112	and you also want to install NTL in <tt>$HOME/sw</tt>,
114	113	you execute:
115	114	<pre>
116		% ./configure PREFIX=$HOME/sw NTL_GMP_LIP=on GMP_PREFIX=$HOME/sw
	115	% ./configure PREFIX=$HOME/sw GMP_PREFIX=$HOME/sw
117	116	</pre>
118	117	You can write this more simply as
119	118	<pre>
120		% ./configure DEF_PREFIX=$HOME/sw NTL_GMP_LIP=on
	119	% ./configure DEF_PREFIX=$HOME/sw
121	120	</pre>
122	121	Here, <tt>DEF_PREFIX</tt> is a variable that is used
123	122	to specify the location of all software,

129	128	standard system directory where your compiler will look by default)
130	129	then simply
131	130	<pre>
132		% ./configure PREFIX=$HOME/sw NTL_GMP_LIP=on
	131	% ./configure PREFIX=$HOME/sw
133	132	</pre>
134	133	does the job.
135	134	Moreover, if NTL is also to be installed in <tt>/usr/local</tt>,
136	135	then
137	136	<pre>
138		% ./configure NTL_GMP_LIP=on
	137	% ./configure
139	138	</pre>
140	139	does the job.
141	140

156	155	then just <tt>-lgmp</tt> does the job.
157	156	Note that <tt>-lgmp</tt> must come <i>after</i> <tt>-lntl</tt>
158	157	on the command line.
159		Finally, if NTL is installed
160		a shared libraries, then you don't even need <tt>-lgmp</tt>.
	158	Finally, if NTL and GMP are installed
	159	as shared libraries, then you don't even need <tt>-lgmp</tt>.
161	160
162	161
163	162	<p>
164	163	NTL has been tested and works correctly with GMP versions 3.1, 3.1.1,
165		4.1.4, and 5.1 (among others).
	164	4.1.4, 5.1, 6.0, and 6.1 (among others).
166	165	It is not possible to use versions of GMP prior to 3.1 with NTL.
167	166
168	167	<p>

+65

-52

doc/tour-impl.html less more

186	186	to doubles produce <i>exact</i> results, provided the inputs and outputs
187	187	are less than <tt>2^p</tt> in absolute value,
188	188	<li>
189		if <tt>y/2 <= x <= 2y</tt>, then <tt>x-y</tt> is computed exactly.
	189	assuming no overflow, <tt>x - long(x)</tt> produces an exact result for nonnegative <tt>x</tt>.
190	190	</ul>
191		Also, NTL allows the compiler to compute <tt>z = x/y</tt> as
192		<tt>t = 1/y</tt>, <tt>z = t*x</tt>.
193	191
194	192	<p>
195	193	It is also generally assumed that the compiler does not

209	207	Unfortunately, some compilers do not do this correctly,
210	208	unless you tell them.
211	209	With Intel's C compiler <tt>icc</tt>, for example,
212		you should compile NTL with the flag <tt>-fp-model source</tt>
	210	you should compile NTL with the flag <tt>-fp-model strict</tt>
213	211	to enforce strict adherence to floating point standards.
	212	That said, some effort has been made to ensure that NTL
	213	works correctly even if the compiler does perform such
	214	regrouping, including replacement of <tt>x/y</tt>
	215	by <tt>x*(1/y)</tt>.
	216
	217	<p>
214	218	Also, you should be wary of compiling using an optimization
215	219	level higher than the default <tt>-O2</tt> --
216	220	this may break some floating point assumptions (and maybe
217	221	some other assumptions as well).
218	222
219
	223	<p>
	224	In any case, programs that compile against NTL header files
	225	should compile correctly, even under very aggressive optimizations.
220	226
221	227	<p>
222	228	One big problem with the IEEE standard is that it allows intermediate

246	252	Hopefully, because of the newer SSE instructions, this whole strict/loose
247	253	issue is a thing of the past.
248	254
	255	<p>
	256	Another problem is that some hardware (especially newer Intel chips)
	257	support fused multiply-add (FMA) instructions.
	258	Again, this is only a problem for <tt>quad_float</tt>, and some
	259	care is taken to detect the problem and to work around it.
	260	The rest of NTL will work fine regardless.
	261
249	262
250	263
251	264	<p>

264	277	and "not a number" are implemented correctly.
265	278
266	279
267		<p>
268		<h3>Implementing long integer arithmetic</h3>
269		<p>
270		There are two basic strategies for implementing long integer arithmetic.
271
272		<p>
273		The default strategy is implemented in the
274		<i>traditional long integer arithmetic package</i>.
275		This package is derived from the LIP package originally developed by
276		A. K. Lenstra, although it has evolved quite a bit within NTL.
277		This package uses no assembly code and is very portable.
278
279		<p>
280		The alternative strategy is to use GMP in place of LIP.
281		In this strategy, the representation of long integers is in a
282		form compatible with GMP.
283		This strategy typically yields the best performance,
284		but requires
285		that GMP is installed on your platform.
286
287		<p>
288		<a href="tour-gmp.html">Go here</a> for more details on the use
289		of GMP with NTL.
	280
290	281
291	282	<p>
292	283	<h3>Algorithms</h3>

324	315	</h3>
325	316	<p>
326	317	As of v7.0, NTL is thread safe.
327		That said, there are two things to be aware of:
	318	That said, there are several things to be aware of:
328	319	<ul>
329		<li>
330		While extreme care has been taken with the design and implementation
331		of thread safety, this feature is still very new and not been
332		subjected to a lot of testing.
333		<li>
334		<p>
	320
	321	<li>
335	322	To use this feature, you have to enable <tt>NTL_THREADS</tt>
336	323	in the configuration script.
337	324	Also, you will need a compiler and runtime library that
338	325	implements several key <tt>C++11</tt> features,
339	326	including <tt>thread_local</tt> storage.
340
341		<p>
342		<i>
343		As I wrote this in Nov. 2014, there were very few compilers
344		that satisfy the requirements.
345		I was successfully able to build and test NTL with threads
346		using gcc 4.9.2 on a Linux system.
347		I have not been able to do so on any current Mac OSX system:
348		in fact, no compiler on current Mac systems (gcc or clang)
349		yet have proper support for <tt>thread_local</tt> storage.
350		</i>
351
352		<p>
353		<li>
354		NTL remains thread safe when built with GMP.
355		however, the current version (v1.1) of the external gf2x
	327	<ul>
	328	<li>
	329	NOTE: as of v9.8, the requirements have been relaxed, so that
	330	for gcc and gcc-compatible compilers
	331	(such as clang and icc) only support of the gcc <tt>__thread</tt>
	332	storage specifier is required.
	333	<li>
	334	With these relaxed requirements, it is possible to build
	335	a thread safe version of NTL on Linux using gcc 4.8 and above,
	336	or on Mac OSX 10.10 and above.
	337
	338	</ul>
	339
	340	<p> <li>
	341	You must build NTL using GMP (i.e., configure with <tt>NTL_GMP_LIP=on</tt>).
	342	The classic LIP integer arithmetic is not thread safe: it could
	343	be made so, but it is not a priority at this time.
	344
	345	<p>
	346	<li>
	347	The current version (v1.1) of the external gf2x
356	348	library is not thread safe.
357		Therefore, <b>you should build NTL using gf2x if you need a thread-safe
	349	Therefore, <b>you should NOT build NTL using gf2x if you need a thread-safe
358	350	build</b>.
359	351	</ul>
360	352

415	407
416	408	<p>
417	409	<h3>
	410	Thread Boosting
	411	</h3>
	412	<p>
	413
	414	As of v9.5.0, NTL provides a <i>thread boosting</i> feature.
	415	With this feature, certain code within NTL will use available
	416	threads to speed up computations on a multicore
	417	machine.
	418	This feature is enabled by setting <tt>NTL_THREAD_BOOST=on</tt>
	419	during configuration.
	420	See <a href="BasicThreadPool.cpp.html">BasicThreadPool.txt</a>
	421	for more information.
	422
	423	<p>
	424	This feature is a work in progress.
	425	Currently, basic <tt>ZZ_pX</tt> arithmetic has been thread boosted.
	426	More code will be boosted later.
	427
	428
	429	<p>
	430	<h3>
418	431	Error Handling and Exceptions
419	432	</h3>
420	433	<p>

+11

-0

doc/tour-modules.html less more

80	80	<p>
81	81
82	82	<table >
	83
	84
	85
	86	<!-- ----------- BasicThreadPool.txt ----------- -->
	87	<p><tr valign=top> <td> <b>
	88	<a href="BasicThreadPool.cpp.html"><tt>BasicThreadPool</tt></a>
	89
	90	</b> <td>
	91
	92	class <tt>BasicThreadPool</tt>: a simple thread pool;
	93	plus additional <i>thread boosting</i> features
83	94
84	95
85	96

+57

-58

doc/tour-time.html less more

19	19	<p> <hr> <p>
20	20
21	21	Here are some timing figures from using NTL.
22		They were obtained using NTL 8.1 compiled with <tt>g++</tt> 4.2.1
23		and with GMP 5.1 on a 2.8GHz Intel Core 2 Duo running on Max OSX 10.6.8.
	22	They were obtained using NTL 9.9.0 compiled with <tt>g++</tt> 4.8.5
	23	and with GMP 6.1 on a 2.3GHz Intel Haswell processor
	24	(E5-2698 v3) running Linux.
24	25
25	26	<p>
26	27	All times are in <i>seconds</i>.

31	32	generate the same data.
32	33
33	34	<p>
	35	NOTE: the PRG changed in v9.4.0, so there may be
	36	some inconsistencies.
	37
	38	<p>
34	39	<pre>
35	40
36		multiply 1000-bit ints: 4.47942e-07
37		remainder 2000/1000-bit ints: 8.3923e-07
38		gcd 1000-bit ints: 1.07981e-05
39		multiply degree-1000 poly mod 1000-bit prime: 0.0140632
40		remainder degree-2000/1000 poly mod 1000-bit prime: 0.0386337
41		preconditioned remainder degree-2000/1000 poly mod 1000-bit prime: 0.0143691
42		gcd degree-1000 poly mod 1000-bit prime: 0.338774
43		multiply degree-1000 int poly with 1000-bit coeffs: 0.0162582
	41	multiply 1000-bit ints: 1.76284e-07
	42	remainder 2000/1000-bit ints: 3.60535e-07
	43	gcd 1000-bit ints: 2.87045e-06
	44	multiply degree-1000 poly mod 1000-bit prime: 0.00432981
	45	remainder degree-2000/1000 poly mod 1000-bit prime: 0.0125583
	46	preconditioned remainder degree-2000/1000 poly mod 1000-bit prime: 0.00443356
	47	gcd degree-1000 poly mod 1000-bit prime: 0.122722
	48	multiply degree-1000 int poly with 1000-bit coeffs: 0.00812543
44	49
45	50	factoring degree-1000 poly mod 1000-bit prime...
46		square-free decomposition...0.339693
	51	square-free decomposition...0.122685
47	52	factoring multiplicity 1, deg = 1000
48		computing X^p...22.9192
49		computing DDF...generating baby steps...+++++++++++++++++++++16.9463
50		generating giant steps...++++++++++++++++++++++17.695
51		giant refine...++++split 1 43
52		split 2 38
53		split 3 64
54		*++++split 5 108
55		*++++split 11 237
56		split 12 510
57		*giant refine time: 8.19123
58		baby refine...split 3 6
59		split 6 6
60		split 9 9
61		split 22 22
62		split 38 38
63		split 64 64
64		split 108 108
65		split 237 237
66		split 248 248
67		split 262 262
68		baby refine time: 0.611474
69		DDF time: 43.4528
70		computing EDF(3,2)...+0.077447
71		...total time = 66.8182
	53	computing X^p...7.23809
	54	computing DDF...generating baby steps...+++++++++++++++++++++6.21623
	55	generating giant steps...++++++++++++++++++++++6.49462
	56	giant refine...++++split 1 1
	57	split 2 26
	58	++++++++++++++++++split 0 973
	59	giant refine time: 4.8139
	60	baby refine...split 1 1
	61	split 26 26
	62	split 973 973
	63	baby refine time: 3.3e-05
	64	DDF time: 17.5262
	65	...total time = 24.8965
72	66
73		multiply 500-bit GF2Xs: 1.03529e-06
74		remainder 1000/500-bit GF2Xs: 6.06942e-06
75		gcd 500-bit GF2Xs: 1.19302e-05
	67	multiply 500-bit GF2Xs: 5.54208e-08
	68	remainder 1000/500-bit GF2Xs: 8.40658e-07
	69	gcd 500-bit GF2Xs: 3.60963e-06
76	70
77		factoring degree-500 GF2X: 0.0010868
78		gcd 500-bit GF2X: 1.18877e-05
79		multiply degree-500 poly mod 500-bit GF2X: 0.024646
80		remainder degree-1000/500 poly mod 500-bit GF2X: 0.0884258
81		preconditioned remainder degree-1000/500 poly mod 500-bit GF2X: 0.049274
82		gcd degree-500 poly mod 500-bit GF2X: 0.555096
	71	factoring degree-500 GF2X: 0.00015574
	72	gcd 500-bit GF2X: 3.61365e-06
	73	multiply degree-500 poly mod 500-bit GF2X: 0.00251375
	74	remainder degree-1000/500 poly mod 500-bit GF2X: 0.00905957
	75	preconditioned remainder degree-1000/500 poly mod 500-bit GF2X: 0.00505149
	76	gcd degree-500 poly mod 500-bit GF2X: 0.0478557
83	77
84	78	factoring degree-500 poly mod 500-bit GF2X...
85		square-free decomposition...0.038657
	79	square-free decomposition...0.004635
86	80	factoring multiplicity 1, deg = 250
87		computing X^p...4.93964
88		computing DDF...generating baby steps...++++++++++3.51054
89		generating giant steps...+++++++++++3.78422
90		giant refine...++++*++++split 6 59
91		split 7 68
92		*split 0 123
93		giant refine time: 2.59123
94		baby refine...split 59 59
95		split 68 68
96		split 123 123
97		baby refine time: 3.4e-05
98		DDF time: 9.88619
	81	computing X^p...0.488941
	82	computing DDF...generating baby steps...++++++++++0.332162
	83	generating giant steps...+++++++++++0.357681
	84	giant refine...++++split 1 9
	85	split 2 13
	86	split 4 44
	87	*++++split 7 73
	88	*split 0 111
	89	giant refine time: 0.233787
	90	baby refine...split 9 9
	91	split 13 13
	92	split 44 44
	93	split 73 73
	94	split 111 111
	95	baby refine time: 0.001275
	96	DDF time: 0.924938
99	97
100		...total time = 14.879
	98	...total time = 1.41792
	99
101	100
102	101
103	102	</pre>

+10

-30

doc/tour-tips.html less more

19	19	<p> <hr> <p>
20	20
21	21	<ol>
22
23		<li>
24		Build NTL using GMP as the long integer package.
25		This is extremely important, as the GMP implementation
26		of long integer arithmetic is <i>much</i> faster
27		than the default implementation.
28		Go <a href="tour-gmp.html">here</a> for details.
29
30		<p>
31		<li>
32		On many machines that optionally offer 64-bit integer arithmetic
33		(recent Mac OSX machines, for instance),
34		you should
35		compile using <tt>gcc</tt> with the option <tt>-m64</tt>
36		to get the full benefit.
37		To do this,
38		pass <tt>"CFLAGS=-O2 -m64"</tt>
39		to the <tt>configure</tt> script (note the use of quotes).
40		If you are using NTL with GMP on such a machine,
41		you <i>must</i> do this to get compatible code.
42		Note, however, that 64-bit is becoming the default, so this
43		may not be necessary.
44
45		<p>
46		<li>
47		On Sparcs,
48		pass the argument <tt>"CFLAGS=-O2 -mcpu=v8"</tt>
49		to the <tt>configure</tt> script.
50		On more recent, 64-bit sparcs, pass <tt>"CFLAGS=-O2 -mcpu=v9 -m64"</tt>
51		to get the full instruction set and 64-bit code.
52	22
53	23	<p>
54	24	<li>

123	93	<tt>a[i]*b[i]</tt>, in every loop iteration.
124	94	The second does not.
125	95
	96	<p>
	97	NOTE: actually, for the class <tt>ZZ</tt>, there is a
	98	special function <tt>MulAddTo</tt>, with which one can write
	99	the loop body simply as
	100	<pre>
	101	MulAddTo(res, a[i], b[i]);
	102	</pre>
	103
126	104
127	105
128	106

133	111	If you <i>must</i> switch the modulus often,
134	112	use the class <tt>ZZ_pContext</tt> to save the information
135	113	associated with the modulus (see <a href="ZZ_p.cpp.html">ZZ_p.txt</a>).
	114	The same holds for analogous classes, such as <tt>zz_p</tt>
	115	and <tt>GF2E</tt>.
136	116
137	117
138	118

+90

-144

doc/tour-unix.html less more

37	37	% gunzip ntl-xxx.tar.gz
38	38	% tar xf ntl-xxx.tar
39	39	% cd ntl-xxx/src
40		% ./configure PREFIX=$HOME/sw
	40	% ./configure
41	41	% make
42	42	% make check
43	43	% make install
44	44	</pre>
45	45
46		This will build, test, and install NTL in <tt>$HOME/sw</tt>.
47		Of course, change <tt>$HOME/sw</tt> to whatever you want (the default is
48		<tt>/usr/local</tt>).
49		You will find the NTL header files in <tt>$HOME/sw/include/NTL</tt>
	46	This will build, test, and install NTL in
	47	<tt>/usr/local</tt>.
	48	For this to work, GMP must already be installed
	49	(most Unix distributions already come with GMP installed,
	50	but see <a href="tour-gmp.html">this page</a> for more
	51	details).
	52	If you really do not want to use GMP,
	53	you can pass the option
	54	<tt>NTL_GMP_LIP=off</tt>
	55	to <tt>configure</tt>.
	56
	57	<p>
	58	After installation,
	59	you will find the NTL header files in <tt>/usr/local/include/NTL</tt>
50	60	and the compiled binary
51		in <tt>$HOME/sw/lib/libntl.a</tt>
	61	in <tt>/usr/local/lib/libntl.a</tt>
52	62	(this is a <i>static</i> library -- if you want a <i>shared</i>
53	63	library, <a href="#shared">see below</a>).
54		<p>
55		If you really are interested in high-performace, you will
56		<i>definitely</i> want to build NTL
57		using GMP (the GNU Multi-Precision package).
58		If GMP has already been installed in a standard
59		place, like <tt>/usr/local</tt>, then invoke <tt>configure</tt>
60		above as
61		<pre>
62		% ./configure PREFIX=$HOME/sw NTL_GMP_LIP=on
63		</pre>
64		and if GMP is installed somewhere else, say <tt>$HOME/sw</tt>, then
65		either
66		<pre>
67		% ./configure PREFIX=$HOME/sw NTL_GMP_LIP=on GMP_PREFIX=$HOME/sw
68		</pre>
69		or, more simply,
70		<pre>
71		% ./configure DEF_PREFIX=$HOME/sw NTL_GMP_LIP=on
72		</pre>
73		does the job.
74		Here, <tt>DEF_PREFIX</tt> is a variable that is used
75		to specify the location of all software,
76		and it defaults to <tt>/usr/local</tt>.
77		<a href="tour-gmp.html">This page</a> provides more
78		details.
	64	Documentation is in <tt>/usr/local/share/doc</tt>,
	65	with the main entry-point at <tt>/usr/local/share/doc/tour.html</tt>.
	66
	67
79	68
80	69	<p>
81	70	If you want very high-performance for polynomial arithmetic
82	71	over <i>GF(2)</i>, you may want to consider using the <tt>gf2x</tt> library.
83		To do this, <tt>gf2x</tt> must already be installed somewhere.
	72	To do this, <tt>gf2x</tt> must already be installed.
84	73	In addition, you should invoke <tt>configure</tt>
85	74	with the option <tt>NTL_GF2X_LIB=on</tt>.
86		If <tt>gf2x</tt> is installed in a standard location, this is
87		all you need to do;
88		otherwise, if <tt>gf2x</tt> is installed, say, in <tt>$HOME/sw</tt>,
89		then you also need to pass the option <tt>GF2X_PREFIX=$HOME/sw</tt>.
90	75	<a href="tour-gf2x.html">This page</a> provides more details.
91	76
92		<p>
93		Even if you don't want to experiment with the <tt>gf2x</tt>
94		library, you might want to try setting <tt>NTL_PCLMUL=on</tt>,
95		which will enable the use special hardware support for fast
96		polynomial arithmetic over <i>GF(2)</i> on platforms that support it
97		(the configure script will check that it actually works).
98		You can set <tt>NTL_PCLMUL=on</tt> even if you also set
99		<tt>NTL_GF2X_LIB=on</tt>, but it probably won't help much.
	77
	78
	79	<p>
	80	If you want to install NTL somewhere besides <tt>/usr/local</tt>,
	81	pass the option <tt>PREFIX=/path/to/install/ntl</tt> to
	82	<tt>configure</tt>.
	83	If GMP is installed somewhere besides <tt>/usr/local</tt>,
	84	pass the option
	85	<tt>GMP_PREFIX=/path/to/gmp</tt>
	86	to <tt>configure</tt>.
	87	You can also pass
	88	<tt>GF2X_PREFIX=/path/to/gf2x</tt>
	89	to <tt>configure</tt>,
	90	if <tt>gf2x</tt> is installed somewhere besides <tt>/usr/local</tt>.
	91	As a shorthand, you can pass the option
	92	<tt>DEF_PREFIX=/path/to/all/software</tt>, which will
	93	override the default for <tt>PREFIX</tt>,
	94	<tt>GMP_PREFIX</tt>, and <tt>GF2X_PREFIX</tt>.
	95
100	96
101	97
102	98	<p>
103	99	Now suppose you want to compile a program that uses NTL.
104		Suppose you are working in some directory and <tt>foo.c</tt>
	100	Suppose you are working in some arbitrary directory and <tt>foo.c</tt>
105	101	is your program.
106		Assume that you have installed NTL in <tt>$HOME/sw</tt> as above.
	102	Assume that you have installed NTL in <tt>/usr/local</tt> as above.
107	103	The following should work:
108	104	<pre>
109		% g++ -I$HOME/sw/include foo.c -o foo -L$HOME/sw/lib -lntl -lm
110		</pre>
111		If you are using GMP, then:
112		<pre>
113		% g++ -I$HOME/sw/include foo.c -o foo -L$HOME/sw/lib -lntl -lgmp -lm
114		</pre>
115		If you are using GMP and <tt>gf2x</tt>, then
116		<pre>
117		% g++ -I$HOME/sw/include foo.c -o foo -L$HOME/sw/lib -lntl -lgmp -lgf2x -lm
118		</pre>
119
	105	% g++ -g -O2 foo.c -o foo -lntl -lgmp -lm
	106	</pre>
	107	If you have installed NTL and/or GMP in a non-standard location,
	108	say <tt>/path/to/sw</tt>,
	109	then:
	110	<pre>
	111	% g++ -g -O2 -I/path/to/sw/include foo.c -o foo -L/path/to/sw/lib -lntl -lgmp -lm
	112	</pre>
	113	If you build NTL with <tt>gf2x</tt>, just add the option
	114	<tt>-lgf2x</tt> to the above, right after <tt>-lgmp</tt>.
	115
	116	<p>
	117	If you are working in the NTL <tt>src</tt> directory itself,
	118	you can just run:
	119	<pre>
	120	% make foo
	121	</pre>
	122	to compile a program <tt>foo.c</tt>, as above.
120	123
121	124	<p>
122	125	<h2>

134	137	<pre>
135	138	% gunzip ntl-xxx.tar.gz
136	139	% tar xvf ntl-xxx.tar
	140	</pre>
	141
	142	On most systems, the following shortcut works:
	143	<pre>
	144	% tar xzvf ntl-xxx.tar.gz
137	145	</pre>
138	146
139	147	<p>

180	188	<pre>
181	189
182	190	CXX=g++ # The C++ compiler
	191
183	192	CXXFLAGS=-g -O2 # C++ complilation flags
184	193
185		DEF_PREFIX=/usr/local # Default software directory
	194	NATIVE=on # Compiles code targeted to the current hardware
	195
	196	DEF_PREFIX=/usr/local# Default software directory
	197
186	198	PREFIX=$(DEF_PREFIX) # Directory in which to install NTL library components
187	199	SHARED=off # Generate a shared library (as well as static)
188	200
189	201	NTL_THREADS=off # compile in thread-safe mode
	202	NTL_THREAD_BOOST=off # compile with thread boosting enabled
190	203	NTL_EXCEPTIONS=off # compile with exceptions enabled
191	204
192		NTL_GMP_LIP=off # Switch to enable the use of GMP as primary
	205	NTL_GMP_LIP=on # Switch to enable the use of GMP as primary
193	206	# long integer package
194	207
195	208	GMP_PREFIX=$(DEF_PREFIX) # Directory in which GMP components are installed
196
197		NTL_PCLMUL=off # switch to enable PCLMUL instruction for
198		# faster arithmetic in GF(2)[X]
199	209
200	210	NTL_GF2X_LIB=off # Switch to enable the use of the gf2x package
201	211	# for faster arithmetic GF(2)[X]

228	238
229	239	<p>
230	240	<li>
231		If you want to use, say, the options <tt>-g</tt> and <tt>-O</tt> for
232		compiling <tt>C</tt> and <tt>C++</tt>, run:
233		<pre>
234		% ./configure "CXXFLAGS=-g -O"
235		</pre>
236		Note the use of quotes to keep the argument in one piece.
237		Also note that the configuration script will sometimes
238		automatically adjust <tt>CXXFLAGS</tt>, depending on
239		other configuration flags that are set
240		(specifically, the <tt>NTL_PCLMUL</tt>, <tt>NTL_THREADS</tt>,
241		and <tt>NTL_EXCEPTIONS</tt> flags).
242		However, these automatic adjustments <i>will not</i>
243		be done if you explicitly set <tt>CXXFLAGS</tt> yourself:
244		you are on your own, then.
245
246
247		<p>
248		<li>
249		If <a href="tour-gmp.html">GMP (the GNU Multi-Precision package)</a>
250		is installed in a standard system directory, and you want to use it
251		to obtain better performance for long integer arithmetic, run:
252		<pre>
253		% ./configure NTL_GMP_LIP=on
254		</pre>
	241	If you want to use, say, the options <tt>-g</tt> and <tt>-O3</tt> for
	242	compiling <tt>C++</tt>, run:
	243	<pre>
	244	% ./configure "CXXFLAGS=-g -O3"
	245	</pre>
	246
	247
	248	<p>
255	249	If GMP was installed in
256	250	<tt>$HOME/sw</tt>,
257	251	run:
258	252	<pre>
259		% ./configure NTL_GMP_LIP=on GMP_PREFIX=$HOME/sw
	253	% ./configure GMP_PREFIX=$HOME/sw
260	254	</pre>
261	255	Go <a href="tour-gmp.html">here</a> for complete details.
262	256

293	287	description.
294	288
295	289	<p>
296		Note that all of these configuration options can also be set
	290	Note that many of these configuration options can also be set
297	291	by editing the two files <tt>makefile</tt>
298	292	and <tt>../include/NTL/config.h</tt> by hand.
299	293	These files are fairly simple and well documented, and so this is not

305	299	and that the file "<tt>def_makefile</tt>"
306	300	contains a backup copy of the original <tt>makefile</tt> file.
307	301
308		<p>
309		This command is intended only as a convenience
310		and -- more importantly -- to allow the configuration process
311		to be script driven.
312		This script does not perform any "magic", like finding out what
313		the local C compiler is called, etc.
314		If the defaults are not
315		correct for your platform, you have to set an appropriate variable.
316	302
317	303
318	304

344	330
345	331	<p>
346	332	<li>
347		A script is run that "automagically"
348		determines the best way to write a timing function
349		on your platform.
350		It tries different routines in the files <tt>GetTime1.c</tt>,
351		<tt>GetTime2.c</tt>, etc., and when it finds a good one,
352		it copies the file into <tt>GetTime.c</tt>.
353		A similar script is run to "automagically"
354		determine if there is something like a <tt>getpid</tt>
355		function available on your platform.
356
	333	Several scripts are run to obtain more information
	334	about your system (e.g.,
	335	to find a timing function, a "getpid" function,
	336	and to detect if things like Intel AVX intrinsics work).
357	337
358	338
359	339	<p>
360	340	<li>
361	341	The file "<tt>../include/NTL/gmp_aux.h</tt>"
362	342	is generated for use with GMP.
363		If not using GMP, this files are still created, but it is empty.
	343	If not using GMP, this file is still created, but it is empty.
364	344
365	345
366	346	<p>

381	361	<pre>
382	362	NTL_LONG_LONG NTL_AVOID_FLOAT NTL_TBL_REM NTL_TBL_REM_LL NTL_AVOID_BRANCHING
383	363	NTL_SPMM_ULL NTL_SPMM_ASM NTL_GF2X_NOINLINE NTL_GF2X_ALTCODE
384		NTL_GF2X_ALTCODE1 NTL_FFT_LAZYMUL NTL_FFT_BIGTAB
	364	NTL_GF2X_ALTCODE1 NTL_FFT_LAZYMUL NTL_FFT_BIGTAB NTL_PCLMUL
385	365	</pre>
386	366	which are set by the wizard.
387	367	Also note that if you <i>do not</i> want the wizard to run,

422	402	copies a number of files to a directory <tt><prefix></tt> that you
423	403	specify by passing <tt>PREFIX=<prefix></tt>
424	404	as an argument to <tt>configure</tt> at configuration time,
425		or as an argument to <tt>make install</tt> at installation time.
	405	or as an argument to <tt>make install</tt> at installation time
	406	(e.g., <tt>make install PREFIX=/path/to/sw</tt>).
426	407	The default is <tt>/usr/local</tt>, so either you need root
427	408	permissions, or you choose a <tt><prefix></tt> for which
428	409	you have write permission.

450	431	<tt>ntl.a</tt>.
451	432	To rebuild after executing <tt>make clean</tt>, execute <tt>make ntl.a</tt>.
452	433
453
454		<p>
455		Assuming you have installed NTL as above,
456		to compile a program <tt>foo.c</tt> that uses NTL,
457		execute
458		<pre>
459		g++ -I<prefix>/include foo.c -o foo -L<prefix>/lib -lntl -lm
460		</pre>
461		This compiles <tt>foo.c</tt> as a <tt>C++</tt> program
462		and creates the binary <tt>foo</tt>.
463		<p>
464		If you built NTL using <a href="tour-gmp.html">GMP</a>, execute:
465		<pre>
466		g++ -I<prefix>/include foo.c -o foo -L<prefix>/lib -lntl -L<gmp_prefix>/lib -lgmp -lm
467		</pre>
468		<p>
469		Of course, if <tt><prefix></tt> and <tt><gmp_prefix></tt>
470		are the same, you do not need to duplicate the <tt>-L</tt>
471		flags, and if either are standard directories, like <tt>/usr/local</tt>,
472		you can leave out the corresponding <tt>-I</tt> and <tt>-L</tt>
473		flags altogether.
474		<p>
475		Similarly, if you built NTL using <a href="tour-gf2x.html"><tt>gf2x</tt></a>,
476		you should include flags
477		<pre>
478		-L<gf2x_prefix>/lib -lgf2x
479		</pre>
480		on the command line.
481		<p>
482		This works even if you are not working in the directory
483		in which you built NTL.
484		If you <i>are</i> working in that directory, you can just execute
485		<pre>
486		make foo
487		</pre>
488	434
489	435	<p>
490	436	<h2>

+33

-4

doc/tour-win.html less more

39	39	<p>
40	40	<b>Windows Users:</b>
41	41	you should consider using a Unix emulation environment like
42		<a href="http://www.mingw.org/">MinGW</a>
43		or <a href="https://www.cygwin.com/">Cygwin</a>, instead of
	42	<a href="https://www.cygwin.com/">Cygwin</a>
	43	or
	44	<a href="http://www.mingw.org/">MinGW</a>,
	45	instead of
44	46	Microsoft development tools.
	47	<p>
45	48	Why?
46	49	<ul>
47	50	<li>
48		MinGW uses gcc, which generally adheres closer to language
	51	These environments use gcc, which generally adheres closer to language
49	52	standards and produces more efficient code than Microsoft's
50	53	compiler.
51	54	<p><li>
52		With MinGW, you can use NTL's
	55	With these environments, you can use NTL's
53	56	<a href="tour-unix.html">Unix distribution</a>,
54	57	and the installation is almost entirely automatic:
55	58	no pointing and clicking -- not much more

58	61	and run NTL's performance-tuning Wizard.
59	62	These factors combined can make a huge difference in performance,
60	63	easily giving you a huge (10x or more) performance improvement.
	64	<p><li>
	65	On 64-bit machines, you should definitely consider Cygwin:
	66	the 64-bit version of Cygwin gives you an
	67	<a href="https://en.wikipedia.org/wiki/64-bit_computing">LP64 data model</a>,
	68	which for many reasons is preferable to the Windows data model.
	69	In particular, you will get the most performance out of NTL
	70	in this environment.
61	71	</ul>
62	72
63	73
64	74
	75
	76	<p>
	77	The remaining instructions on this page only apply
	78	if you <i>do not</i> use a Unix emulation environment
	79	like Cygwin or MinGW.
	80
	81	<p>
	82	If you really want to get the most out of NTL, please stop,
	83	and seriously consider using a Unix emulation environment
	84	and
	85	NTL's
	86	<a href="tour-unix.html">Unix distribution</a>.
	87	Your code will be much snappier, and your quality of life
	88	will be much better.
	89
	90	<p>
	91	You have been warned.
	92
65	93	<p>
66	94	<b>
67	95	Obtaining and unpacking NTL.
68	96	</b>
	97
69	98	<p>
70	99
71	100	To obtain the source code and documentation for NTL,

-1

doc/vec_GF2.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_GF2.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_GF2.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/vec_GF2E.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_GF2E.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_GF2E.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/vec_RR.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_RR.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_RR.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/vec_ZZ.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_ZZ.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_ZZ.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/vec_ZZ_p.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_ZZ_p.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_ZZ_p.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/vec_ZZ_pE.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_ZZ_pE.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_ZZ_pE.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/vec_lzz_p.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_lzz_p.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_lzz_p.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/vec_lzz_pE.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vec_lzz_pE.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vec_lzz_pE.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

+14

-10

doc/vector.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/vector.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/vector.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

109	109	<font color="#0000ed"><i>that NTL's strategy.  However, the new "move semantics", introduced in C++11,</i></font><br>
110	110	<font color="#0000ed"><i>mitigate this issue somewhat.</i></font><br>
111	111	<br>
112		<font color="#0000ed"><i>Because of NTL's relocatability requirement, it is not recommended to</i></font><br>
113		<font color="#0000ed"><i>use NTL vectors over classes coming from the standard library, which</i></font><br>
114		<font color="#0000ed"><i>may not satisfy the requirement.</i></font><br>
	112	<font color="#0000ed"><i>Because of NTL's relocatability requirement, it is not recommended to use NTL</i></font><br>
	113	<font color="#0000ed"><i>vectors over classes coming from the standard library, which may not satisfy</i></font><br>
	114	<font color="#0000ed"><i>the requirement.  In those cases, you could either use an STL vector, or use an</i></font><br>
	115	<font color="#0000ed"><i>NTL vector and wrap the suspect classes in an NTL smart pointer of some kind</i></font><br>
	116	<font color="#0000ed"><i>(SmartPtr or OptionalVal).</i></font><br>
115	117	<br>
116	118	<font color="#0000ed"><i>Note also that Facebook's open source "folly" library also provides</i></font><br>
117	119	<font color="#0000ed"><i>a vector class that uses realloc in a manner very similar to NTL's vector class.</i></font><br>

198	200	<br>
199	201	T* elts();<br>
200	202	<font color="#008b00"><b>const</b></font> T* elts() <font color="#008b00"><b>const</b></font>;<br>
201		<font color="#0000ed"><i>// returns address of first vector element (or 0 if no space has</i></font><br>
202		<font color="#0000ed"><i>// been allocated for this vector).  If a vector potentially has</i></font><br>
203		<font color="#0000ed"><i>// length 0, it is safer to write v.elts() instead of &v[0].</i></font><br>
204		<font color="#0000ed"><i>// The first version is applied to non-const Vec<T>,</i></font><br>
205		<font color="#0000ed"><i>// and returns a non-const pointer to a T, while the second version</i></font><br>
206		<font color="#0000ed"><i>// is applied to a const Vec<T> and returns a const reference to a T.</i></font><br>
	203	<font color="#0000ed"><i>// returns address of first vector element (or 0 if no space has been</i></font><br>
	204	<font color="#0000ed"><i>// allocated for this vector).  If a vector potentially has length 0, it is</i></font><br>
	205	<font color="#0000ed"><i>// safer to write v.elts() instead of &v[0]: the latter is not well defined</i></font><br>
	206	<font color="#0000ed"><i>// by the C++ standard (although this is likely an academic concern).</i></font><br>
	207	<font color="#0000ed"><i>//</i></font><br>
	208	<font color="#0000ed"><i>// The first version is applied to non-const Vec<T>, and returns a non-const</i></font><br>
	209	<font color="#0000ed"><i>// pointer to a T, while the second version is applied to a const Vec<T> and</i></font><br>
	210	<font color="#0000ed"><i>// returns a const pointer to a T.</i></font><br>
207	211	<br>
208	212	<br>
209	213	<font color="#008b00"><b>void</b></font> swap(Vec<T>& y);<br>

+13

-9

doc/vector.txt less more

101	101	that NTL's strategy. However, the new "move semantics", introduced in C++11,
102	102	mitigate this issue somewhat.
103	103
104		Because of NTL's relocatability requirement, it is not recommended to
105		use NTL vectors over classes coming from the standard library, which
106		may not satisfy the requirement.
	104	Because of NTL's relocatability requirement, it is not recommended to use NTL
	105	vectors over classes coming from the standard library, which may not satisfy
	106	the requirement. In those cases, you could either use an STL vector, or use an
	107	NTL vector and wrap the suspect classes in an NTL smart pointer of some kind
	108	(SmartPtr or OptionalVal).
107	109
108	110	Note also that Facebook's open source "folly" library also provides
109	111	a vector class that uses realloc in a manner very similar to NTL's vector class.

190	192
191	193	T* elts();
192	194	const T* elts() const;
193		// returns address of first vector element (or 0 if no space has
194		// been allocated for this vector). If a vector potentially has
195		// length 0, it is safer to write v.elts() instead of &v[0].
196		// The first version is applied to non-const Vec<T>,
197		// and returns a non-const pointer to a T, while the second version
198		// is applied to a const Vec<T> and returns a const reference to a T.
	195	// returns address of first vector element (or 0 if no space has been
	196	// allocated for this vector). If a vector potentially has length 0, it is
	197	// safer to write v.elts() instead of &v[0]: the latter is not well defined
	198	// by the C++ standard (although this is likely an academic concern).
	199	//
	200	// The first version is applied to non-const Vec<T>, and returns a non-const
	201	// pointer to a T, while the second version is applied to a const Vec<T> and
	202	// returns a const pointer to a T.
199	203
200	204
201	205	void swap(Vec<T>& y);

-1

doc/version.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/version.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/version.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

-1

doc/xdouble.cpp.html less more

0	0	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1	1	<html>
2	2	<head>
3		<title>/Volumes/unix-files/u/ntl-new/ntl-6.1.0/doc/xdouble.cpp.html</title>
	3	<title>/Volumes/unix-files/u/ntl-new/ntl-9.9.0dev/doc/xdouble.cpp.html</title>
4	4	<meta name="Generator" content="Vim/7.1">
5	5	<meta http-equiv="content-type" content="text/html; charset=UTF-8">
6	6	</head>

+723

-0

include/NTL/BasicThreadPool.h less more

	0
	1	#ifndef NTL_BasicThreadPool__H
	2	#define NTL_BasicThreadPool__H
	3
	4	#include <NTL/tools.h>
	5	#include <NTL/vector.h>
	6	#include <NTL/SmartPtr.h>
	7	#include <NTL/thread.h>
	8
	9
	10	NTL_OPEN_NNS
	11
	12
	13	inline long AvailableThreads();
	14
	15	struct PartitionInfo {
	16	long nintervals; // number of intervals
	17	long intervalsz; // interval size
	18	long nsintervals; // number of small intervals
	19
	20	explicit
	21	PartitionInfo(long sz, long nt = AvailableThreads())
	22	// partitions [0..sz) into nintervals intervals,
	23	// so that there are nsintervals of size intervalsz-1
	24	// and nintervals-nsintervals of size intervalsz
	25	{
	26	if (sz <= 0) {
	27	nintervals = intervalsz = nsintervals = 0;
	28	return;
	29	}
	30
	31	if (nt <= 0) LogicError("PartitionInfo: bad args");
	32
	33	// NOTE: this overflow check probably unnecessary
	34	if (NTL_OVERFLOW(sz, 1, 0) \|\| NTL_OVERFLOW(nt, 1, 0))
	35	ResourceError("PartitionInfo: arg too big");
	36
	37	if (sz < nt) {
	38	nintervals = sz;
	39	intervalsz = 1;
	40	nsintervals = 0;
	41	return;
	42	}
	43
	44	nintervals = nt;
	45
	46	long q, r;
	47	q = sz/nt;
	48	r = sz - nt*q;
	49
	50	if (r == 0) {
	51	intervalsz = q;
	52	nsintervals = 0;
	53	}
	54	else {
	55	intervalsz = q+1;
	56	nsintervals = nt - r;
	57	}
	58	}
	59
	60	long NumIntervals() const { return nintervals; }
	61
	62	void interval(long& first, long& last, long i) const
	63	// [first..last) is the ith interval -- no range checking is done
	64	{
	65
	66	#if 0
	67	// this is the logic, naturally expressed
	68	if (i < nsintervals) {
	69	first = i*(intervalsz-1);
	70	last = first + (intervalsz-1);
	71	}
	72	else {
	73	first = nsintervals(intervalsz-1) + (i-nsintervals)intervalsz;
	74	last = first + intervalsz;
	75	}
	76	#else
	77	// this is the same logic, but branch-free (and portable)
	78	// ...probably unnecessary optimization
	79
	80	long mask = -long(cast_unsigned(i-nsintervals) >> (NTL_BITS_PER_LONG-1));
	81	// mask == -1 if i < nsintervals, 0 o/w
	82
	83	long lfirst = i*(intervalsz-1);
	84	lfirst += long((~cast_unsigned(mask)) & cast_unsigned(i-nsintervals));
	85	// lfirst += max(0, i-nsintervals)
	86
	87	long llast = lfirst + intervalsz + mask;
	88
	89	first = lfirst;
	90	last = llast;
	91	#endif
	92	}
	93
	94	};
	95
	96
	97
	98	NTL_CLOSE_NNS
	99
	100
	101
	102	#ifdef NTL_THREADS
	103
	104
	105	#include <thread>
	106	#include <condition_variable>
	107	#include <exception>
	108
	109
	110	NTL_OPEN_NNS
	111
	112	/*************************************************************
	113
	114	Some simple thread pooling.
	115
	116	You create a thread pool by constructing a BasicThreadPool object.
	117	For example:
	118
	119	long nthreads = 4;
	120	BasicThreadPool pool(nthreads);
	121
	122	creates a thread pool of 4 threads. These threads will exist
	123	until the destructor for pool is called.
	124
	125	The simplest way to use a thread pool is as follows.
	126	Suppose you have a task that consists of N subtasks,
	127	indexed 0..N-1. Then you can write:
	128
	129
	130	pool.exec_range(N,
	131	[&](long first, long last) {
	132	for (long i = first; i < last; i++) {
	133	... code to process subtask i ...
	134	}
	135	}
	136	);
	137
	138	The second argument to exec_range is a C++11 "lambda".
	139	The "[&]" indicates that all local variables in the calling
	140	context are captured by reference, so the lambda body can
	141	reference all visible local variables directly.
	142
	143	A lower-level interface is also provided.
	144	One can write:
	145
	146	pool.exec_index(n,
	147	[&](long index) {
	148	... code to process index i ...
	149	}
	150	);
	151
	152	This will activate n threads with indices 0..n-1, and execute
	153	the given code on each index. The parameter n must be
	154	in the range 1..nthreads, otherwise an error is raised.
	155
	156	This lower-level interface is useful in some cases,
	157	especially when memory is managed in some special way.
	158	For convenience, a method is provided to break
	159	subtasks up into smaller, almost-equal-sized groups
	160	of subtasks:
	161
	162	Vec<long> pvec;
	163	long n = pool.SplitProblems(N, pvec);
	164
	165	can be used for this. N is the number of subtasks, indexed 0..N-1.
	166	This method will compute n as needed by exec_index, and
	167	the range of subtasks to be processed by a given index in the range
	168	0..n-1 is pvec[index]..pvec[index+1]-1
	169	Thus, the logic of the above exec_range example can be written
	170	using the lower-level exec_index interface as follows:
	171
	172
	173	Vec<long> pvec;
	174	long n = pool.SplitProblems(N, pvec);
	175	pool.exec_index(n,
	176	[&](long index) {
	177	long first = pvec[index];
	178	long last = pvec[index+1];
	179	for (long i = first; i < last; i++) {
	180	... code to process subtask i ...
	181	}
	182	}
	183	);
	184
	185	However, with this approach, memory or other resources can be
	186	assigned to each index = 0..n-1, and managed externally.
	187
	188
	189
	190
	191	*************************************************************/
	192
	193
	194	class BasicThreadPool {
	195	private:
	196
	197	// lots of nested stuff
	198
	199	template<class T>
	200	class SimpleSignal {
	201	private:
	202	T val;
	203	std::mutex m;
	204	std::condition_variable cv;
	205
	206	SimpleSignal(const SimpleSignal&); // disabled
	207	void operator=(const SimpleSignal&); // disabled
	208
	209	public:
	210	SimpleSignal() : val(0) { }
	211
	212	T wait()
	213	{
	214	std::unique_lock<std::mutex> lock(m);
	215	cv.wait(lock, [&]() { return val; } );
	216	T old_val = val;
	217	val = 0;
	218	return old_val;
	219	}
	220
	221	void send(T new_val)
	222	{
	223	std::lock_guard<std::mutex> lock(m);
	224	val = new_val;
	225	cv.notify_one();
	226	}
	227	};
	228
	229
	230	template<class T, class T1>
	231	class CompositeSignal {
	232	private:
	233	T val;
	234	T1 val1;
	235	std::mutex m;
	236	std::condition_variable cv;
	237
	238	CompositeSignal(const CompositeSignal&); // disabled
	239	void operator=(const CompositeSignal&); // disabled
	240
	241	public:
	242	CompositeSignal() : val(0) { }
	243
	244	T wait(T1& _val1)
	245	{
	246	std::unique_lock<std::mutex> lock(m);
	247	cv.wait(lock, [&]() { return val; } );
	248	T _val = val;
	249	_val1 = val1;
	250	val = 0;
	251	return _val;
	252	}
	253
	254	void send(T _val, T1 _val1)
	255	{
	256	std::lock_guard<std::mutex> lock(m);
	257	val = _val;
	258	val1 = _val1;
	259	cv.notify_one();
	260	}
	261	};
	262
	263
	264
	265	class ConcurrentTask {
	266	BasicThreadPool *pool;
	267	public:
	268	ConcurrentTask(BasicThreadPool *_pool) : pool(_pool) { }
	269	BasicThreadPool *getBasicThreadPool() const { return pool; }
	270
	271	virtual void run(long index) = 0;
	272	};
	273
	274
	275
	276	// dummy class, used for signalling termination
	277	class ConcurrentTaskTerminate : public ConcurrentTask {
	278	public:
	279	ConcurrentTaskTerminate() : ConcurrentTask(0) { }
	280	void run(long index) { }
	281	};
	282
	283
	284
	285	template<class Fct>
	286	class ConcurrentTaskFct : public ConcurrentTask {
	287	public:
	288	const Fct& fct;
	289
	290	ConcurrentTaskFct(BasicThreadPool *_pool, const Fct& _fct) :
	291	ConcurrentTask(_pool), fct(_fct) { }
	292
	293	void run(long index) { fct(index); }
	294	};
	295
	296	template<class Fct>
	297	class ConcurrentTaskFct1 : public ConcurrentTask {
	298	public:
	299	const Fct& fct;
	300	const PartitionInfo& pinfo;
	301
	302	ConcurrentTaskFct1(BasicThreadPool *_pool, const Fct& _fct,
	303	const PartitionInfo& _pinfo) :
	304	ConcurrentTask(_pool), fct(_fct), pinfo(_pinfo) { }
	305
	306	void run(long index)
	307	{
	308	long first, last;
	309	pinfo.interval(first, last, index);
	310	fct(first, last);
	311	}
	312	};
	313
	314
	315
	316	struct AutomaticThread {
	317	CompositeSignal< ConcurrentTask *, long > localSignal;
	318	ConcurrentTaskTerminate term;
	319	std::thread t;
	320
	321
	322	AutomaticThread() : t(worker, &localSignal)
	323	{
	324	// cerr << "starting thread " << t.get_id() << "\n";
	325	}
	326
	327	~AutomaticThread()
	328	{
	329	// cerr << "stopping thread " << t.get_id() << "...";
	330	localSignal.send(&term, -1);
	331	t.join();
	332	// cerr << "\n";
	333	}
	334	};
	335
	336
	337
	338	// BasicThreadPool data members
	339
	340	long nthreads;
	341
	342	bool active_flag;
	343
	344	std::atomic<long> counter;
	345	SimpleSignal<bool> globalSignal;
	346
	347	Vec< UniquePtr<AutomaticThread> > threadVec;
	348
	349	std::exception_ptr eptr;
	350	std::mutex eptr_guard;
	351
	352	// BasicThreadPool private member functions
	353
	354	BasicThreadPool(const BasicThreadPool&); // disabled
	355	void operator=(const BasicThreadPool&); // disabled
	356
	357	void launch(ConcurrentTask *task, long index)
	358	{
	359	threadVec[index]->localSignal.send(task, index);
	360	}
	361
	362	void begin(long cnt)
	363	{
	364
	365	active_flag = true;
	366	counter = cnt;
	367	}
	368
	369	void end()
	370	{
	371	globalSignal.wait();
	372
	373	active_flag = false;
	374
	375	if (eptr) {
	376	std::exception_ptr eptr1 = eptr;
	377	eptr = nullptr;
	378	std::rethrow_exception(eptr1);
	379	}
	380	}
	381
	382	static void runOneTask(ConcurrentTask *task, long index)
	383	{
	384	BasicThreadPool *pool = task->getBasicThreadPool();
	385
	386	try {
	387	task->run(index);
	388	}
	389	catch (...) {
	390	std::lock_guard<std::mutex> lock(pool->eptr_guard);
	391	if (!pool->eptr) pool->eptr = std::current_exception();
	392	}
	393
	394	if (--(pool->counter) == 0) pool->globalSignal.send(true);
	395	}
	396
	397	static void worker(CompositeSignal< ConcurrentTask , long > localSignal)
	398	{
	399	for (;;) {
	400	long index = -1;
	401	ConcurrentTask *task = localSignal->wait(index);
	402	if (index == -1) return;
	403
	404	runOneTask(task, index);
	405	}
	406	}
	407
	408
	409	public:
	410
	411
	412	long NumThreads() const { return nthreads; }
	413	bool active() const { return active_flag; }
	414
	415	explicit
	416	BasicThreadPool(long _nthreads) :
	417	nthreads(_nthreads), active_flag(false), counter(0)
	418	{
	419	if (nthreads <= 0) LogicError("BasicThreadPool::BasicThreadPool: bad args");
	420
	421	if (NTL_OVERFLOW(nthreads, 1, 0))
	422	ResourceError("BasicThreadPool::BasicThreadPool: arg too big");
	423
	424	threadVec.SetLength(nthreads-1);
	425
	426	for (long i = 0; i < nthreads-1; i++) {
	427	threadVec[i].make();
	428	}
	429	}
	430
	431	~BasicThreadPool()
	432	{
	433	if (active()) TerminalError("BasicThreadPool: destructor called while active");
	434	}
	435
	436
	437	// adding, deleting, moving threads
	438
	439	void add(long n = 1)
	440	{
	441	if (active()) LogicError("BasicThreadPool: illegal operation while active");
	442	if (n <= 0) LogicError("BasicThreadPool::add: bad args");
	443	if (NTL_OVERFLOW(n, 1, 0))
	444	ResourceError("BasicThreadPool::add: arg too big");
	445
	446	Vec< UniquePtr<AutomaticThread> > newThreads;
	447
	448	newThreads.SetLength(n);
	449	for (long i = 0; i < n; i++)
	450	newThreads[i].make();
	451
	452	threadVec.SetLength(n + nthreads - 1);
	453	for (long i = 0; i < n; i++)
	454	threadVec[nthreads-1+i].move(newThreads[i]);
	455
	456	nthreads += n;
	457	}
	458
	459
	460	void remove(long n = 1)
	461	{
	462	if (active()) LogicError("BasicThreadPool: illegal operation while active");
	463	if (n <= 0 \|\| n >= nthreads) LogicError("BasicThreadPool::remove: bad args");
	464
	465	for (long i = nthreads-1-n; i < nthreads-1; i++)
	466	threadVec[i] = 0;
	467
	468	threadVec.SetLength(nthreads-1-n);
	469	nthreads -= n;
	470	}
	471
	472
	473	void move(BasicThreadPool& other, long n = 1)
	474	{
	475	if (active() \|\| other.active())
	476	LogicError("BasicThreadPool: illegal operation while active");
	477	if (n <= 0 \|\| n >= other.nthreads) LogicError("BasicThreadPool::move: bad args");
	478
	479	if (this == &other) return;
	480
	481	threadVec.SetLength(n + nthreads - 1);
	482	for (long i = 0; i < n; i++)
	483	threadVec[nthreads-1+i].move(other.threadVec[other.nthreads-1-n+i]);
	484
	485	other.threadVec.SetLength(other.nthreads-1-n);
	486	other.nthreads -= n;
	487
	488	nthreads += n;
	489	}
	490
	491
	492
	493	// High level interfaces, intended to be used with lambdas
	494
	495	// In this version, fct takes one argument, which is
	496	// an index in [0..cnt)
	497
	498	template<class Fct>
	499	void exec_index(long cnt, const Fct& fct)
	500	{
	501	if (active()) LogicError("BasicThreadPool: illegal operation while active");
	502	if (cnt <= 0) return;
	503	if (cnt > nthreads) LogicError("BasicThreadPool::exec_index: bad args");
	504
	505	ConcurrentTaskFct<Fct> task(this, fct);
	506
	507	begin(cnt);
	508	for (long t = 0; t < cnt-1; t++) launch(&task, t);
	509	runOneTask(&task, cnt-1);
	510	end();
	511	}
	512
	513	template<class Fct>
	514	static void relaxed_exec_index(BasicThreadPool *pool, long cnt, const Fct& fct)
	515	{
	516	if (cnt <= 0) return;
	517	if (!pool \|\| pool->active()) {
	518	if (cnt > 0) LogicError("friendly_exec_index: not enough threads");
	519	fct(0);
	520	}
	521	else {
	522	pool->exec_index(cnt, fct);
	523	}
	524	}
	525
	526	// even higher level version: sz is the number of subproblems,
	527	// and fct takes two args, first and last, so that subproblems
	528	// [first..last) are processed.
	529
	530	template<class Fct>
	531	void exec_range(long sz, const Fct& fct)
	532	{
	533	if (active()) LogicError("BasicThreadPool: illegal operation while active");
	534	if (sz <= 0) return;
	535
	536	PartitionInfo pinfo(sz, nthreads);
	537
	538	long cnt = pinfo.NumIntervals();
	539	ConcurrentTaskFct1<Fct> task(this, fct, pinfo);
	540
	541	begin(cnt);
	542	for (long t = 0; t < cnt-1; t++) launch(&task, t);
	543	runOneTask(&task, cnt-1);
	544	end();
	545	}
	546
	547	template<class Fct>
	548	static void relaxed_exec_range(BasicThreadPool *pool, long sz, const Fct& fct)
	549	{
	550	if (sz <= 0) return;
	551	if (!pool \|\| pool->active() \|\| sz == 1) {
	552	fct(0, sz);
	553	}
	554	else {
	555	pool->exec_range(sz, fct);
	556	}
	557	}
	558
	559	};
	560
	561
	562
	563
	564	NTL_CLOSE_NNS
	565
	566
	567	#endif
	568
	569
	570
	571	#ifdef NTL_THREAD_BOOST
	572
	573	#ifndef NTL_THREADS
	574	#error "NTL_THREAD_BOOST requires NTL_THREADS"
	575	#endif
	576
	577	NTL_OPEN_NNS
	578
	579	extern
	580	NTL_CHEAP_THREAD_LOCAL BasicThreadPool *NTLThreadPool_ptr;
	581
	582	inline
	583	BasicThreadPool *GetThreadPool()
	584	{
	585	return NTLThreadPool_ptr;
	586	}
	587
	588	void ResetThreadPool(BasicThreadPool *pool = 0);
	589	BasicThreadPool *ReleaseThreadPool();
	590
	591	inline void SetNumThreads(long n)
	592	{
	593	ResetThreadPool(MakeRaw<BasicThreadPool>(n));
	594	}
	595
	596	inline long AvailableThreads()
	597	{
	598	BasicThreadPool *pool = GetThreadPool();
	599	if (!pool \|\| pool->active())
	600	return 1;
	601	else
	602	return pool->NumThreads();
	603	}
	604
	605
	606	NTL_CLOSE_NNS
	607
	608
	609	#define NTL_EXEC_RANGE(n, first, last) \
	610	{ \
	611	NTL_NNS BasicThreadPool::relaxed_exec_range(NTL_NNS GetThreadPool(), (n), \
	612	[&](long first, long last) { \
	613
	614
	615	#define NTL_EXEC_RANGE_END \
	616	} ); \
	617	} \
	618
	619
	620	#define NTL_GEXEC_RANGE(seq, n, first, last) \
	621	{ \
	622	NTL_NNS BasicThreadPool::relaxed_exec_range((seq) ? 0 : NTL_NNS GetThreadPool(), (n), \
	623	[&](long first, long last) { \
	624
	625
	626	#define NTL_GEXEC_RANGE_END \
	627	} ); \
	628	} \
	629
	630
	631	#define NTL_EXEC_INDEX(n, index) \
	632	{ \
	633	NTL_NNS BasicThreadPool::relaxed_exec_index(NTL_NNS GetThreadPool(), (n), \
	634	[&](long index) { \
	635
	636
	637	#define NTL_EXEC_INDEX_END \
	638	} ); \
	639	} \
	640
	641
	642
	643	// NOTE: at least with gcc >= 4.9.2, the GEXEC versions will evaluate seq, and
	644	// if it is true, jump directly (more or less) to the body
	645
	646
	647	#define NTL_TBDECL(x) static void basic_ ## x
	648	#define NTL_TBDECL_static(x) static void basic_ ## x
	649
	650
	651	#else
	652
	653	NTL_OPEN_NNS
	654
	655
	656	inline void SetNumThreads(long n) { }
	657
	658	inline long AvailableThreads() { return 1; }
	659
	660
	661	NTL_CLOSE_NNS
	662
	663	#define NTL_EXEC_RANGE(n, first, last) \
	664	{ \
	665	long _ntl_par_exec_n = (n); \
	666	if (_ntl_par_exec_n > 0) { \
	667	long first = 0; \
	668	long last = _ntl_par_exec_n; \
	669	{ \
	670
	671
	672	#define NTL_EXEC_RANGE_END }}}
	673
	674	#define NTL_GEXEC_RANGE(seq, n, first, last) \
	675	{ \
	676	long _ntl_par_exec_n = (n); \
	677	if (_ntl_par_exec_n > 0) { \
	678	long first = 0; \
	679	long last = _ntl_par_exec_n; \
	680	{ \
	681
	682
	683	#define NTL_GEXEC_RANGE_END }}}
	684
	685
	686
	687
	688	#define NTL_EXEC_INDEX(n, index) \
	689	{ \
	690	long _ntl_par_exec_n = (n); \
	691	if (_ntl_par_exec_n > 0) { \
	692	if (_ntl_par_exec_n > 1) NTL_NNS LogicError("NTL_EXEC_INDEX: not enough threads"); \
	693	long index = 0; \
	694	{ \
	695
	696
	697	#define NTL_EXEC_INDEX_END }}}
	698
	699
	700
	701	#define NTL_TBDECL(x) void x
	702	#define NTL_TBDECL_static(x) static void x
	703
	704	#endif
	705
	706
	707
	708	#ifdef NTL_THREADS
	709
	710	#define NTL_IMPORT(x) auto _ntl_hidden_variable_IMPORT__ ## x = x; auto x = _ntl_hidden_variable_IMPORT__ ##x;
	711
	712	#else
	713
	714	#define NTL_IMPORT(x)
	715
	716
	717	#endif
	718
	719
	720
	721	#endif
	722

-2

include/NTL/GF2E.h less more

31	31	Lazy<ZZ> _card;
32	32	};
33	33
34		NTL_THREAD_LOCAL
35		extern SmartPtr<GF2EInfoT> GF2EInfo; // info for current modulus, initially null
	34	extern
	35	NTL_CHEAP_THREAD_LOCAL
	36	GF2EInfoT *GF2EInfo;
	37	// info for current modulus, initially null
	38	// fast TLS access
36	39
37	40
38	41

-2

include/NTL/GF2EX.h less more

848	848	vec_GF2EX H;
849	849	};
850	850
851		NTL_THREAD_LOCAL
852		extern long GF2EXArgBound;
	851	extern
	852	NTL_CHEAP_THREAD_LOCAL
	853	long GF2EXArgBound;
853	854
854	855
855	856	void build(GF2EXArgument& H, const GF2EX& h, const GF2EXModulus& F, long m);

-6

include/NTL/GF2EXFactoring.h less more

68	68	// Uses "Berlekamp" appraoch.
69	69
70	70
71		NTL_THREAD_LOCAL
72		extern long GF2EX_BlockingFactor;
	71	extern
	72	NTL_CHEAP_THREAD_LOCAL
	73	long GF2EX_BlockingFactor;
73	74	// Controls GCD blocking for DDF.
74	75
75	76	void DDF(vec_pair_GF2EX_long& factors, const GF2EX& f, const GF2EX& h,

84	85	// Assumes f is monic and square-free, and h = X^p mod f
85	86	// Obsolete: see NewDDF, below.
86	87
87		NTL_THREAD_LOCAL
88		extern long GF2EX_GCDTableSize; /* = 4 */
	88	extern
	89	NTL_CHEAP_THREAD_LOCAL
	90	long GF2EX_GCDTableSize; /* = 4 */
89	91	// Controls GCD blocking for NewDDF
90	92
91	93
92		NTL_THREAD_LOCAL
93		extern double GF2EXFileThresh;
	94	extern
	95	NTL_CHEAP_THREAD_LOCAL
	96	double GF2EXFileThresh;
94	97	// external files are used for baby/giant steps if size
95	98	// of these tables exceeds GF2EXFileThresh KB.
96	99

-2

include/NTL/GF2X.h less more

55	55
56	56
57	57
58		NTL_THREAD_LOCAL static long HexOutput;
	58	static NTL_CHEAP_THREAD_LOCAL long HexOutput;
59	59
60	60	inline GF2X(long i, GF2 c);
61	61	inline GF2X(long i, long c);

737	737	~GF2XWatcher() { watched.KillBig(); }
738	738	};
739	739
740		#define NTL_GF2XRegister(x) NTL_THREAD_LOCAL static GF2X x; GF2XWatcher _WATCHER__ ## x(x)
	740	#define NTL_GF2XRegister(x) NTL_TLS_LOCAL(GF2X, x); GF2XWatcher _WATCHER__ ## x(x)
741	741
742	742
743	743

-0

include/NTL/HAVE_AVX.h less more

-0

include/NTL/HAVE_BUILTIN_CLZL.h less more

-0

include/NTL/HAVE_FMA.h less more

-0

include/NTL/HAVE_LL_TYPE.h less more

-2

include/NTL/LLL.h less more

25	25
26	26	typedef long (*LLLCheckFct)(const vec_ZZ&);
27	27
28		NTL_THREAD_LOCAL extern double LLLStatusInterval;
29		NTL_THREAD_LOCAL extern char *LLLDumpFile;
	28	extern NTL_CHEAP_THREAD_LOCAL double LLLStatusInterval;
	29	extern NTL_CHEAP_THREAD_LOCAL char *LLLDumpFile;
30	30
31	31
32	32	// classical Gramm-Schmidt versions

-2

include/NTL/LazyTable.h less more

49	49	LazyTable();
50	50
51	51
52		const T * const operator[] (long i) const;
	52	const T * operator[] (long i) const;
53	53
54	54	~LazyTable();
55	55

97	97	public:
98	98	LazyTable() : len(0) { }
99	99
100		const T * const operator[] (long i) const
	100	const T * operator[] (long i) const
101	101	{
102	102	// FIXME: add optional range checking
103	103

-2

include/NTL/RR.h less more

43	43	const ZZ& mantissa() const { return x; }
44	44	long exponent() const { return e; }
45	45
46		NTL_THREAD_LOCAL static long prec;
	46	static NTL_CHEAP_THREAD_LOCAL long prec;
47	47	static void SetPrecision(long p);
48	48	static long precision() { return prec; }
49	49
50		NTL_THREAD_LOCAL static long oprec;
	50	static NTL_CHEAP_THREAD_LOCAL long oprec;
51	51	static void SetOutputPrecision(long p);
52	52	static long OutputPrecision() { return oprec; }
53	53

-0

include/NTL/SPMM_ASM.h less more

131	131	unsigned long hi, lo;
132	132	__asm__ ("mulq %3" : "=a" (lo), "=d" (hi) : "%0" (a), "rm" (b));
133	133
	134	//__asm__ ("mulxq %2,%1,%0" : "=r" (hi), "=r" (lo) : "rm" (a), "d" (b));
	135	// this uses the mulx instruction - no real benefit
	136
134	137	return hi;
135	138	}
136	139

+125

-1

include/NTL/SmartPtr.h less more

201	201	if (dp) {
202	202	cp = NTL_NEW_OP SmartPtrControlDerived<T>(dp);
203	203	if (!cp) {
204		delete dp; // if we throw an exception
	204	delete dp; // this could theoretically throw an exception
205	205	MemoryError();
206	206	}
207	207	AddRef();

843	843	T* release() { T *p = dp; dp = 0; return p; }
844	844	void move(UniquePtr& other) { reset(other.release()); }
845	845
	846	template<class Y>
	847	void move(UniquePtr<Y>& other) { reset(other.release()); }
	848
846	849	void swap(UniquePtr& other)
847	850	{
848	851	_ntl_swap(dp, other.dp);

918	921	// using psuedo variadic templates
919	922
920	923	p1.reset(rp); // destroy's p1's referent and assign rp
	924
921	925
922	926	if (p1.exists()) ... // test for null
923	927
924	928	p1.val() // dereference
925	929
	930	rp = p1.get(); // fetch raw pointer
	931	rp = p1.release(); // fetch raw pointer, and set to NULL
926	932	p1.move(p2); // if p1 != p2 then:
927	933	// makes p1 point to p2's referent,
928	934	// setting p2 to NULL and destroying

983	989	T& val() const { return *dp; }
984	990
985	991	bool exists() const { return dp != 0; }
	992
	993	T* get() const { return dp.get(); }
	994
	995	T* release() { return dp.release(); }
986	996
987	997	void move(OptionalVal& other) { dp.move(other.dp); }
988	998

1084	1094	T& operator[](long i) const { return dp[i]; }
1085	1095
1086	1096	T* get() const { return dp; }
	1097	T *elts() const { return dp; }
1087	1098
1088	1099	T* release() { T *p = dp; dp = 0; return p; }
1089	1100	void move(UniqueArray& other) { reset(other.release()); }

1344	1355
1345	1356
1346	1357
	1358	// AlignedArray:
	1359	//
	1360	// specialized arrays that have similar interface to UniqueArray, but:
	1361	// * they are allocated with a given alignment
	1362	// * they (currently) only work on POD types
	1363	// the current implementation uses posix_memalign, which seems
	1364	// to work on gcc and gcc clones (clang and icc).
	1365	// intended for use with Intel AVX intrinsics
	1366	//
	1367	// For now, this is not a part of the documented interface, and it only
	1368	// works with __GNUC__. If __GNUC__ is not defined, then it reverts to using
	1369	// malloc. Currently, it is only really needed if NTL_HAVE_AVX is defined,
	1370	// which anyway requires __GNUC__.
	1371	//
	1372	// This could all change in the future, if and when there is a more portable
	1373	// way of doing this.
	1374
	1375	// NOTE: the methods reset, free, and release are available, but should really
	1376	// only be used to move raw pointers around between compatible AlignedArray's.
	1377
	1378	// NOTE: posix_memalign has been available since glibc 2.1.91, which is some
	1379	// time around the year 2000, so this should be portable.
	1380
	1381	template<class T, long align=NTL_DEFAULT_ALIGN>
	1382	class AlignedArray {
	1383	private:
	1384	T *dp;
	1385
	1386	class Dummy { };
	1387
	1388	typedef void (AlignedArray::*fake_null_type)(Dummy) const;
	1389	void fake_null_function(Dummy) const {}
	1390
	1391	bool cannot_compare_these_types() const { return false; }
	1392
	1393	AlignedArray(const AlignedArray&); // disabled
	1394	void operator=(const AlignedArray&); // disabled
	1395
	1396	public:
	1397	explicit AlignedArray(T *p) : dp(p) { }
	1398
	1399	AlignedArray() : dp(0) { }
	1400
	1401	~AlignedArray() { NTL_SNS free(dp); }
	1402
	1403
	1404	void reset(T* p = 0)
	1405	{
	1406	AlignedArray tmp(p);
	1407	tmp.swap(*this);
	1408	}
	1409
	1410	AlignedArray& operator=(fake_null_type) { reset(); return *this; }
	1411
	1412	void SetLength(long n)
	1413	{
	1414	using namespace std;
	1415	// not clear if posix_memalign is in std:: or ::
	1416	// this will make sure to find it in either case
	1417
	1418	if (align <= 0 \|\| n < 0) LogicError("AlignedArray::SetLength: bad args");
	1419	if (NTL_OVERFLOW1(n, sizeof(T), 0)) ResourceError("AlignedArray::SetLength: overflow");
	1420
	1421	if (n == 0) {
	1422	reset();
	1423	}
	1424	else
	1425	{
	1426	void *p;
	1427
	1428	#ifdef __GNUC__
	1429	#define NTL_HAVE_ALIGNED_ARRAY
	1430	if (posix_memalign(&p, align, n*sizeof(T))) MemoryError();
	1431	#else
	1432	p = malloc(n*sizeof(T));
	1433	if (!p) MemoryError();
	1434	#endif
	1435
	1436	reset( (T*) p );
	1437	}
	1438	}
	1439
	1440	T& operator[](long i) const { return dp[i]; }
	1441
	1442	T* get() const { return dp; }
	1443	T* elts() const { return dp; }
	1444
	1445	T* release() { T *p = dp; dp = 0; return p; }
	1446	void move(AlignedArray& other) { reset(other.release()); }
	1447
	1448	void swap(AlignedArray& other)
	1449	{
	1450	_ntl_swap(dp, other.dp);
	1451	}
	1452
	1453	AlignedArray(fake_null_type) : dp(0) { }
	1454
	1455	operator fake_null_type() const
	1456	{
	1457	return dp ? &AlignedArray::fake_null_function : 0;
	1458	}
	1459
	1460	};
	1461
	1462
	1463	// free swap function
	1464	template<class T, long align>
	1465	void swap(AlignedArray<T,align>& p, AlignedArray<T,align>& q) { p.swap(q); }
	1466
	1467
	1468
	1469
	1470
1347	1471
1348	1472
1349	1473

+136

-2

include/NTL/ZZ.h less more

151	151	~ZZWatcher() { watched.KillBig(); }
152	152	};
153	153
154		#define NTL_ZZRegister(x) NTL_THREAD_LOCAL static ZZ x; ZZWatcher _WATCHER__ ## x(x)
	154	#define NTL_ZZRegister(x) NTL_TLS_LOCAL(ZZ, x); ZZWatcher _WATCHER__ ## x(x)
155	155
156	156
157	157

568	568	}
569	569
570	570
	571	// montgomery
	572	class ZZ_ReduceStructAdapter {
	573	public:
	574	UniquePtr<_ntl_reduce_struct> rep;
	575
	576	void init(const ZZ& p, const ZZ& excess)
	577	{
	578	rep.reset(_ntl_reduce_struct_build(p.rep, excess.rep));
	579	}
	580
	581	void eval(ZZ& x, ZZ& a) const
	582	{
	583	rep->eval(&x.rep, &a.rep);
	584	}
	585
	586	void adjust(ZZ& x) const
	587	{
	588	rep->adjust(&x.rep);
	589	}
	590	};
	591
571	592
572	593
573	594	/*******************************************************

652	673
653	674	inline ZZ& operator%=(ZZ& x, const ZZ& b)
654	675	{ rem(x, x, b); return x; }
	676
	677
	678	// preconditioned single-precision variant
	679	// not documented for now...
	680
	681
	682	class PreconditionedRemainder {
	683	private:
	684	long p;
	685	UniquePtr<_ntl_general_rem_one_struct> pinfo;
	686
	687	public:
	688	PreconditionedRemainder(long _p, long sz) : p(_p)
	689	{
	690	pinfo.reset(_ntl_general_rem_one_struct_build(p, sz));
	691	}
	692
	693
	694	long operator()(const ZZ& a)
	695	{
	696	return _ntl_general_rem_one_struct_apply(a.rep, p, pinfo.get());
	697	}
	698	};
	699
655	700
656	701
657	702	/**********************************************************

929	974	************************************************************/
930	975
931	976
	977	// ================ NEW PRG STUFF =================
	978
	979
	980	// Low-level key-derivation
	981
	982
	983	void DeriveKey(unsigned char *key, long klen,
	984	const unsigned char *data, long dlen);
	985
	986
	987
	988	// Low-level chacha stuff
	989
	990	#define NTL_PRG_KEYLEN (32)
	991
	992	class RandomStream {
	993	private:
	994	_ntl_uint32 state[16];
	995	unsigned char buf[64];
	996	long pos;
	997
	998	void do_get(unsigned char *res, long n);
	999
	1000	public:
	1001	explicit
	1002	RandomStream(const unsigned char *key);
	1003
	1004	// No default constructor
	1005	// default copy and assignment
	1006
	1007	void get(unsigned char *res, long n)
	1008	{
	1009	// optimize short reads
	1010	if (n >= 0 && n <= 64-pos) {
	1011	long i;
	1012	for (i = 0; i < n; i++) {
	1013	res[i] = buf[pos+i];
	1014	}
	1015	pos += n;
	1016	}
	1017	else {
	1018	do_get(res, n);
	1019	}
	1020	}
	1021
	1022	};
	1023
	1024
	1025
	1026
	1027	RandomStream& GetCurrentRandomStream();
	1028	// get reference to the current random byte stream --
	1029	// if SetSeed has not been called, it is called with
	1030	// a default value (which should be unique to each
	1031	// process/thread)
	1032
	1033
932	1034	void SetSeed(const ZZ& s);
	1035	void SetSeed(const unsigned char *data, long dlen);
	1036	void SetSeed(const RandomStream& s);
933	1037	// initialize random number generator
	1038	// in the first two versions, a PRG key is derived from
	1039	// the data using DeriveKey.
	1040
	1041
	1042	// RAII for saving/restoring current state of PRG
	1043
	1044	class RandomStreamPush {
	1045	private:
	1046	RandomStream saved;
	1047
	1048	RandomStreamPush(const RandomStreamPush&); // disable
	1049	void operator=(const RandomStreamPush&); // disable
	1050
	1051	public:
	1052	RandomStreamPush() : saved(GetCurrentRandomStream()) { }
	1053	~RandomStreamPush() { SetSeed(saved); }
	1054
	1055	};
	1056
	1057
934	1058
935	1059
936	1060	void RandomBnd(ZZ& x, const ZZ& n);

958	1082	// single-precision version of the above
959	1083
960	1084	long RandomBnd(long n);
	1085	inline void RandomBnd(long& x, long n) { x = RandomBnd(n); }
961	1086
962	1087	long RandomLen_long(long l);
	1088	inline void RandomLen(long& x, long l) { x = RandomLen_long(l); }
963	1089
964	1090	long RandomBits_long(long l);
	1091	inline void RandomBits(long& x, long l) { x = RandomBits_long(l); }
	1092
	1093
	1094	// specialty routines
965	1095
966	1096	unsigned long RandomWord();
967	1097	unsigned long RandomBits_ulong(long l);

1211	1341
1212	1342
1213	1343	inline long InvModStatus(ZZ& x, const ZZ& a, const ZZ& n)
1214		// if gcd(a,b) = 1, then ReturnValue = 0, x = a^{-1} mod n
	1344	// if gcd(a,n) = 1, then ReturnValue = 0, x = a^{-1} mod n
1215	1345	// otherwise, ReturnValue = 1, x = gcd(a, n)
1216	1346
1217	1347	{ return NTL_zinv(a.rep, n.rep, &x.rep); }

1329	1459
1330	1460	long InvMod(long a, long n);
1331	1461	// computes a^{-1} mod n. Error is raised if undefined.
	1462
	1463	long InvModStatus(long& x, long a, long n);
	1464	// if gcd(a,n) = 1, then ReturnValue = 0, x = a^{-1} mod n
	1465	// otherwise, ReturnValue = 1, x = gcd(a, n)
1332	1466
1333	1467	long PowerMod(long a, long e, long n);
1334	1468	// computes a^e mod n, e >= 0

-5

include/NTL/ZZXFactoring.h less more

40	40	// f divides a polynomial h whose Euclidean norm
41	41	// is bounded by 2^{bnd} in absolute value.
42	42
43		NTL_THREAD_LOCAL extern long ZZXFac_MaxPrune;
44		NTL_THREAD_LOCAL extern long ZZXFac_InitNumPrimes;
45		NTL_THREAD_LOCAL extern long ZZXFac_MaxNumPrimes;
46		NTL_THREAD_LOCAL extern long ZZXFac_PowerHack;
47		NTL_THREAD_LOCAL extern long ZZXFac_van_Hoeij;
	43	extern NTL_CHEAP_THREAD_LOCAL long ZZXFac_MaxPrune;
	44	extern NTL_CHEAP_THREAD_LOCAL long ZZXFac_InitNumPrimes;
	45	extern NTL_CHEAP_THREAD_LOCAL long ZZXFac_MaxNumPrimes;
	46	extern NTL_CHEAP_THREAD_LOCAL long ZZXFac_PowerHack;
	47	extern NTL_CHEAP_THREAD_LOCAL long ZZXFac_van_Hoeij;
48	48
49	49
50	50	void factor(ZZ& c,

+27

-16

include/NTL/ZZ_p.h less more

23	23
24	24	long NumPrimes;
25	25	long MaxRoot;
26		bool QuickCRT;
27	26	ZZ MinusMModP; // -M mod p, M = product of primes
28	27	ZZ_CRTStructAdapter crt_struct;
29	28	ZZ_RemStructAdapter rem_struct;
30	29
31	30
32	31	// the following arrays are indexed 0..NumPrimes-1
33		// q = FFTPrime[i]
34		Vec<double> x; // u/q, where u = (M/q)^{-1} mod q
35		Vec<long> u; // u, as above
	32	// q[i] = FFTPrime[i]
	33	Vec<long> prime; // prime[i] = q[i]
	34	Vec<double> prime_recip; // prime_recip[i] = 1/double(q[i])
	35	Vec<long> u; // u[i] = (M/q[i])^{-1} mod q[i]
36	36	Vec<mulmod_precon_t> uqinv;
	37
	38	ZZ_ReduceStructAdapter reduce_struct;
	39
37	40	};
38	41
39	42

67	70	ZZ_TmpVecAdapter rem_tmp_vec;
68	71	};
69	72
70		NTL_THREAD_LOCAL
71		extern SmartPtr<ZZ_pInfoT> ZZ_pInfo;
	73
	74	extern
	75	NTL_CHEAP_THREAD_LOCAL
	76	ZZ_pInfoT *ZZ_pInfo;
72	77	// info for current modulus, initially null
73
74		NTL_THREAD_LOCAL
75		extern SmartPtr<ZZ_pTmpSpaceT> ZZ_pTmpSpace;
	78	// plain pointer for faster TLS access
	79
	80	extern
	81	NTL_CHEAP_THREAD_LOCAL
	82	ZZ_pTmpSpaceT *ZZ_pTmpSpace;
76	83	// space for temps associated with current modulus,
77
78		NTL_THREAD_LOCAL
79		extern bool ZZ_pInstalled;
	84	// plain pointer for faster TLS access
	85
	86	extern
	87	NTL_CHEAP_THREAD_LOCAL
	88	bool ZZ_pInstalled;
80	89	// flag indicating if current modulus is fully installed
81	90
82	91

93	102
94	103	// copy constructor, assignment, destructor: default
95	104
96		void save() { ptr = ZZ_pInfo; }
	105	void save();
97	106	void restore() const;
98	107
99	108	};

155	164
156	165	typedef void (*DivHandlerPtr)(const ZZ_p& a); // error-handler for division
157	166
158		NTL_THREAD_LOCAL static DivHandlerPtr DivHandler;
	167	static
	168	NTL_CHEAP_THREAD_LOCAL
	169	DivHandlerPtr DivHandler;
159	170
160	171
161	172	// ****** constructors and assignment

206	217	static ZZ_pTmpSpaceT* GetTmpSpace()
207	218	{
208	219	install();
209		return ZZ_pTmpSpace.get();
	220	return ZZ_pTmpSpace;
210	221	}
211	222
212	223

521	532	~ZZ_pWatcher() { watched.KillBig(); }
522	533	};
523	534
524		#define NTL_ZZ_pRegister(x) NTL_THREAD_LOCAL static ZZ_p x; ZZ_pWatcher _WATCHER__ ## x(x); x.allocate()
	535	#define NTL_ZZ_pRegister(x) NTL_TLS_LOCAL(ZZ_p, x); ZZ_pWatcher _WATCHER__ ## x(x); x.allocate()
525	536
526	537	// FIXME: register variables that are allocated with respect to one modulus
527	538	// and then reused with another modulus may have initial values that are

-2

include/NTL/ZZ_pE.h less more

30	30
31	31	};
32	32
33		NTL_THREAD_LOCAL
34		extern SmartPtr<ZZ_pEInfoT> ZZ_pEInfo; // info for current modulus, initially null
	33	extern
	34	NTL_CHEAP_THREAD_LOCAL
	35	ZZ_pEInfoT *ZZ_pEInfo;
	36	// info for current modulus, initially null
	37	// raw pointer for faster TLS access
35	38
36	39
37	40

-1

include/NTL/ZZ_pEX.h less more

847	847	vec_ZZ_pEX H;
848	848	};
849	849
850		NTL_THREAD_LOCAL extern long ZZ_pEXArgBound;
	850	extern NTL_CHEAP_THREAD_LOCAL long ZZ_pEXArgBound;
851	851
852	852
853	853	void build(ZZ_pEXArgument& H, const ZZ_pEX& h, const ZZ_pEXModulus& F, long m);

-2

include/NTL/ZZ_pEXFactoring.h less more

35	35	// assumes that f is monic and splits into distinct linear factors
36	36
37	37
38		NTL_THREAD_LOCAL extern long ZZ_pEX_GCDTableSize; /* = 4 */
	38	extern
	39	NTL_CHEAP_THREAD_LOCAL
	40	long ZZ_pEX_GCDTableSize; /* = 4 */
39	41	// Controls GCD blocking for NewDDF
40	42
41		NTL_THREAD_LOCAL extern double ZZ_pEXFileThresh;
	43	extern
	44	NTL_CHEAP_THREAD_LOCAL
	45	double ZZ_pEXFileThresh;
42	46	// of these tables exceeds ZZ_pEXFileThresh KB.
43	47
44	48

-1

include/NTL/ZZ_pX.h less more

655	655	// converts coefficients lo..hi to a 2^k-point FFTRep.
656	656	// must have hi-lo+1 < 2^k
657	657
	658
	659	void FromFFTRep(ZZ_pXModRep& x, const FFTRep& a);
	660	// for testing and timing purposes only -- converts from FFTRep
	661
	662	void FromZZ_pXModRep(ZZ_pX& x, const ZZ_pXModRep& a, long lo, long hi);
	663	// for testing and timing purposes only -- converts from ZZ_pXModRep
658	664
659	665
660	666

1101	1107	vec_ZZ_pX H;
1102	1108	};
1103	1109
1104		NTL_THREAD_LOCAL extern long ZZ_pXArgBound;
	1110	extern NTL_CHEAP_THREAD_LOCAL long ZZ_pXArgBound;
1105	1111
1106	1112
1107	1113	void build(ZZ_pXArgument& H, const ZZ_pX& h, const ZZ_pXModulus& F, long m);

-3

include/NTL/ZZ_pXFactoring.h less more

67	67	// Uses "Berlekamp" appraoch.
68	68
69	69
70		NTL_THREAD_LOCAL extern long ZZ_pX_BlockingFactor;
	70	extern NTL_CHEAP_THREAD_LOCAL long ZZ_pX_BlockingFactor;
71	71	// Controls GCD blocking for DDF.
72	72
73	73	void DDF(vec_pair_ZZ_pX_long& factors, const ZZ_pX& f, const ZZ_pX& h,

81	81	// Assumes f is monic and square-free, and h = X^p mod f
82	82	// Obsolete: see NewDDF, below.
83	83
84		NTL_THREAD_LOCAL extern long ZZ_pX_GCDTableSize; /* = 4 */
	84	extern NTL_CHEAP_THREAD_LOCAL long ZZ_pX_GCDTableSize; /* = 4 */
85	85	// Controls GCD blocking for NewDDF
86	86
87	87
88		NTL_THREAD_LOCAL extern double ZZ_pXFileThresh;
	88	extern NTL_CHEAP_THREAD_LOCAL double ZZ_pXFileThresh;
89	89	// external files are used for baby/giant steps if size
90	90	// of these tables exceeds ZZ_pXFileThresh KB.
91	91

+106

-33

include/NTL/config.h less more

83	83	#endif
84	84
85	85	#if 0
	86	#define NTL_DISABLE_TLS_HACK
	87
	88	/* Set if you want to compile NTL without "TLS hack"
	89	*
	90	* To re-build after changing this flag: rm *.o; make ntl.a
	91	*/
	92
	93	#endif
	94
	95	#if 0
	96	#define NTL_ENABLE_TLS_HACK
	97
	98	/* Set if you want to compile NTL with "TLS hack"
	99	*
	100	* To re-build after changing this flag: rm *.o; make ntl.a
	101	*/
	102
	103	#endif
	104
	105	#if 0
86	106	#define NTL_THREADS
87	107
88	108	/* Set if you want to compile NTL as a thread-safe library.

103	123
104	124	#endif
105	125
106
107		#if 0
	126	#if 0
	127	#define NTL_THREAD_BOOST
	128
	129	/* Set if you want to compile NTL to exploit threads internally.
	130	*
	131	* To re-build after changing this flag: rm *.o; make ntl.a
	132	*/
	133
	134	#endif
	135	#
	136
	137	#if 1
108	138	#define NTL_GMP_LIP
109	139
110	140	/*

147	177
148	178	#endif
149	179
150		#if 0
151		#define NTL_PCLMUL
152
153		/*
154		* Use this flag for faster GF2X arithmetc.
155		* This enables the use of the PCLMUL instruction on x86-64
156		* machines.
157		*
158		* To re-build after changing this flag:
159		* rm GF2X.o; make ntl.a
160		*/
161
162		#endif
163	180
164	181	#if 0
165	182	#define NTL_LONG_LONG_TYPE long long

300	317	#if 0
301	318	#define NTL_DISABLE_LONGDOUBLE
302	319
303		/* Explicitly disables us of long double arithmetic in the
304		* single-precision modular arithmetic routines
	320	/* Explicitly disables use of long double arithmetic
305	321	*/
306	322
307	323	#endif

310	326	#if 0
311	327	#define NTL_DISABLE_LONGLONG
312	328
313		/* Explicitly disables us of long long arithmetic in the
314		* single-precision modular arithmetic routines
315		*/
316
317		#endif
318
319
	329	/* Explicitly disables use of long long arithmetic
	330	*/
	331
	332	#endif
	333
	334	#if 0
	335	#define NTL_DISABLE_LL_ASM
	336
	337	/* Explicitly disables use of inline assembly as a replacement
	338	* for long long arithmetic.
	339	*/
	340
	341	#endif
	342
	343
	344	#if 0
	345	#define NTL_MAXIMIZE_SP_NBITS
	346
	347	/* Allows for 62-bit single-precision moduli on 64-bit platforms.
	348	* By default, such moduli are restricted to 60 bits, which
	349	* usually gives slightly better performance across a range of
	350	* parameters.
	351	*/
	352
	353	#endif
320	354
321	355	/*************************************************************************
322	356	*

505	539	#endif
506	540
507	541
	542	#if 0
	543	#define NTL_CRT_ALTCODE
	544
	545	/*
	546	* Employs an alternative CRT strategy.
	547	* Only relevant with GMP.
	548	* Seems to be marginally faster on some x86_64 platforms.
	549	*
	550	* To re-build after changing this flag:
	551	* rm lip.o; make ntl.a
	552	*/
	553
	554	#endif
	555
	556	#if 0
	557	#define NTL_CRT_ALTCODE_SMALL
	558
	559	/*
	560	* Employs an alternative CRT strategy for small moduli.
	561	* Only relevant with GMP.
	562	* Seems to be marginally faster on some x86_64 platforms.
	563	*
	564	* To re-build after changing this flag:
	565	* rm lip.o; make ntl.a
	566	*/
	567
	568	#endif
	569
508	570
509	571	#if 0
510	572	#define NTL_GF2X_ALTCODE

547	609	#endif
548	610
549	611
550
551
552
553
554
555
556
557
558		#endif
	612	#if 0
	613	#define NTL_PCLMUL
	614
	615	/*
	616	* Use this flag for faster GF2X arithmetic.
	617	* This enables the use of the PCLMUL instruction on x86-64
	618	* machines.
	619	*
	620	* To re-build after changing this flag:
	621	* rm GF2X.o; make ntl.a
	622	*/
	623
	624	#endif
	625
	626
	627
	628
	629
	630
	631	#endif

+84

-2

include/NTL/ctools.h less more

3	3
4	4	#include <NTL/config.h>
5	5	#include <NTL/mach_desc.h>
6		#include <NTL/have_LL.h>
7		#include <NTL/have_builtin_clzl.h>
	6	#include <NTL/HAVE_LL_TYPE.h>
	7	#include <NTL/HAVE_BUILTIN_CLZL.h>
	8	#include <NTL/HAVE_AVX.h>
	9	#include <NTL/HAVE_FMA.h>
8	10
9	11
10	12	/*

73	75
74	76	#endif
75	77
	78
	79	#ifdef NTL_HAVE_LL_TYPE
	80
	81	typedef NTL_LL_TYPE _ntl_longlong;
	82	typedef NTL_ULL_TYPE _ntl_ulonglong;
	83	// typenames are more convenient than macros
	84
	85	#else
	86
	87	#undef NTL_LL_TYPE
	88	#undef NTL_ULL_TYPE
	89	// prevent any use of these macros
	90
	91	class _ntl_longlong { private: _ntl_longlong() { } };
	92	class _ntl_ulonglong { private: _ntl_ulonglong() { } };
	93	// cannot create variables of these types
	94
	95
	96	#endif
	97
76	98	/********************************************************/
	99
	100
	101
	102	// Define an unsigned type with at least 32 bits
	103	// there is no truly portable way to do this, yet...
	104
	105
	106	#if (NTL_BITS_PER_INT >= 32)
	107
	108	typedef unsigned int _ntl_uint32; // 32-bit word
	109	#define NTL_BITS_PER_INT32 NTL_BITS_PER_INT
	110
	111	#else
	112
	113	// NOTE: the C++ standard guarantees longs are at least 32 bits wide,
	114	// and this is also explicitly checked at build time
	115
	116	typedef unsigned long _ntl_uint32; // 32-bit word
	117	#define NTL_BITS_PER_INT32 NTL_BITS_PER_LONG
	118
	119	#endif
	120
77	121
78	122
79	123	// The usual token pasting stuff...

280	324
281	325	#define NTL_THREAD_LOCAL thread_local
282	326
	327	#ifdef __GNUC__
	328	#define NTL_CHEAP_THREAD_LOCAL __thread
	329	#else
	330	#define NTL_CHEAP_THREAD_LOCAL thread_local
	331	#endif
	332
283	333	#else
284	334
285	335	#define NTL_THREAD_LOCAL
	336	#define NTL_CHEAP_THREAD_LOCAL
286	337
287	338	#endif
288	339

340	391	as the C++ standard is kind of broken on the issue of where
341	392	swap is defined. And I also only want it defined for built-in types.
342	393	*/
	394
	395
	396
	397
	398	// The following is for aligning small local arrays
	399	// Equivalent to type x[n], but aligns to align bytes
	400	// Only works for POD types
	401	// NOTE: the gcc aligned attribute might work, but there is
	402	// some chatter on the web that this was (at some point) buggy.
	403	// Not clear what the current status is.
	404	// Anyway, this is only intended for use with gcc on intel
	405	// machines, so it should be OK.
	406
	407
	408	#define NTL_ALIGNED_LOCAL_ARRAY(align, x, type, n) \
	409	char x##__ntl_hidden_variable_storage[n*sizeof(type)+align]; \
	410	type x = (type ) ((&x##__ntl_hidden_variable_storage[0]) + \
	411	((-((unsigned long) (&x##__ntl_hidden_variable_storage[0]))) %\
	412	(unsigned long)(align))) \
	413
	414
	415	#define NTL_AVX_BYTE_ALIGN (32)
	416	#define NTL_AVX_DBL_ALIGN (NTL_AVX_BYTE_ALIGN/long(sizeof(double)))
	417
	418	#define NTL_AVX_LOCAL_ARRAY(x, type, n) NTL_ALIGNED_LOCAL_ARRAY(NTL_AVX_BYTE_ALIGN, x, type, n)
	419
	420	#define NTL_DEFAULT_ALIGN (64)
	421	// this should be big enough to satisfy any SIMD instructions,
	422	// and it should also be as big as a cache line
	423
	424
343	425
344	426
345	427	#endif

+106

-33

include/NTL/def_config.h less more

83	83	#endif
84	84
85	85	#if 0
	86	#define NTL_DISABLE_TLS_HACK
	87
	88	/* Set if you want to compile NTL without "TLS hack"
	89	*
	90	* To re-build after changing this flag: rm *.o; make ntl.a
	91	*/
	92
	93	#endif
	94
	95	#if 0
	96	#define NTL_ENABLE_TLS_HACK
	97
	98	/* Set if you want to compile NTL with "TLS hack"
	99	*
	100	* To re-build after changing this flag: rm *.o; make ntl.a
	101	*/
	102
	103	#endif
	104
	105	#if 0
86	106	#define NTL_THREADS
87	107
88	108	/* Set if you want to compile NTL as a thread-safe library.

103	123
104	124	#endif
105	125
106
107		#if 0
	126	#if 0
	127	#define NTL_THREAD_BOOST
	128
	129	/* Set if you want to compile NTL to exploit threads internally.
	130	*
	131	* To re-build after changing this flag: rm *.o; make ntl.a
	132	*/
	133
	134	#endif
	135	#
	136
	137	#if 1
108	138	#define NTL_GMP_LIP
109	139
110	140	/*

147	177
148	178	#endif
149	179
150		#if 0
151		#define NTL_PCLMUL
152
153		/*
154		* Use this flag for faster GF2X arithmetc.
155		* This enables the use of the PCLMUL instruction on x86-64
156		* machines.
157		*
158		* To re-build after changing this flag:
159		* rm GF2X.o; make ntl.a
160		*/
161
162		#endif
163	180
164	181	#if 0
165	182	#define NTL_LONG_LONG_TYPE long long

300	317	#if 0
301	318	#define NTL_DISABLE_LONGDOUBLE
302	319
303		/* Explicitly disables us of long double arithmetic in the
304		* single-precision modular arithmetic routines
	320	/* Explicitly disables use of long double arithmetic
305	321	*/
306	322
307	323	#endif

310	326	#if 0
311	327	#define NTL_DISABLE_LONGLONG
312	328
313		/* Explicitly disables us of long long arithmetic in the
314		* single-precision modular arithmetic routines
315		*/
316
317		#endif
318
319
	329	/* Explicitly disables use of long long arithmetic
	330	*/
	331
	332	#endif
	333
	334	#if 0
	335	#define NTL_DISABLE_LL_ASM
	336
	337	/* Explicitly disables use of inline assembly as a replacement
	338	* for long long arithmetic.
	339	*/
	340
	341	#endif
	342
	343
	344	#if 0
	345	#define NTL_MAXIMIZE_SP_NBITS
	346
	347	/* Allows for 62-bit single-precision moduli on 64-bit platforms.
	348	* By default, such moduli are restricted to 60 bits, which
	349	* usually gives slightly better performance across a range of
	350	* parameters.
	351	*/
	352
	353	#endif
320	354
321	355	/*************************************************************************
322	356	*

505	539	#endif
506	540
507	541
	542	#if 0
	543	#define NTL_CRT_ALTCODE
	544
	545	/*
	546	* Employs an alternative CRT strategy.
	547	* Only relevant with GMP.
	548	* Seems to be marginally faster on some x86_64 platforms.
	549	*
	550	* To re-build after changing this flag:
	551	* rm lip.o; make ntl.a
	552	*/
	553
	554	#endif
	555
	556	#if 0
	557	#define NTL_CRT_ALTCODE_SMALL
	558
	559	/*
	560	* Employs an alternative CRT strategy for small moduli.
	561	* Only relevant with GMP.
	562	* Seems to be marginally faster on some x86_64 platforms.
	563	*
	564	* To re-build after changing this flag:
	565	* rm lip.o; make ntl.a
	566	*/
	567
	568	#endif
	569
508	570
509	571	#if 0
510	572	#define NTL_GF2X_ALTCODE

547	609	#endif
548	610
549	611
550
551
552
553
554
555
556
557
558		#endif
	612	#if 0
	613	#define NTL_PCLMUL
	614
	615	/*
	616	* Use this flag for faster GF2X arithmetic.
	617	* This enables the use of the PCLMUL instruction on x86-64
	618	* machines.
	619	*
	620	* To re-build after changing this flag:
	621	* rm GF2X.o; make ntl.a
	622	*/
	623
	624	#endif
	625
	626
	627
	628
	629
	630
	631	#endif

-1

include/NTL/g_lip.h less more

21	21	#endif
22	22
23	23
24		#if (defined(NTL_HAVE_LL_TYPE) && !defined(NTL_LEGACY_SP_MULMOD) && !defined(NTL_DISABLE_LONGLONG))
	24	#if (defined(NTL_HAVE_LL_TYPE) && !defined(NTL_LEGACY_SP_MULMOD))
25	25
26	26	#define NTL_LONGLONG_SP_MULMOD
27	27

-0

~~include/NTL/have_LL.h~~ less more

(Empty file)

-0

~~include/NTL/have_LL_no.h~~ less more

(Empty file)

-4

~~include/NTL/have_LL_yes.h~~ less more

0		#ifndef NTL_HAVE_LL_TYPE
1		#define NTL_HAVE_LL_TYPE
2		#endif
3

-0

~~include/NTL/have_builtin_clzl.h~~ less more

(Empty file)

-0

~~include/NTL/have_builtin_clzl_no.h~~ less more

(Empty file)

-4

~~include/NTL/have_builtin_clzl_yes.h~~ less more

0		#ifndef NTL_HAVE_BUILTIN_CLZL
1		#define NTL_HAVE_BUILTIN_CLZL
2		#endif
3

+26

-0

include/NTL/lip.h less more

50	50	_ntl_rem_struct_build(long n, NTL_verylong modulus, long (*p)(long));
51	51
52	52
	53	// montgomery
	54	class _ntl_reduce_struct {
	55	public:
	56	virtual ~_ntl_reduce_struct() { }
	57	virtual void eval(NTL_verylong x, NTL_verylong a) = 0;
	58	virtual void adjust(NTL_verylong *x) = 0;
	59	};
	60
	61	_ntl_reduce_struct *
	62	_ntl_reduce_struct_build(NTL_verylong modulus, NTL_verylong excess);
	63
	64
	65	// faster reduction with preconditioning -- general usage, single modulus
	66
	67	class _ntl_general_rem_one_struct {
	68	public:
	69	virtual ~_ntl_general_rem_one_struct() { }
	70	};
	71
	72	_ntl_general_rem_one_struct *
	73	_ntl_general_rem_one_struct_build(long p, long sz);
	74
	75	long
	76	_ntl_general_rem_one_struct_apply(NTL_verylong a, long p, _ntl_general_rem_one_struct *pinfo);
	77
	78
53	79
54	80
55	81

+33

-3

include/NTL/lzz_p.h less more

26	26	mulmod_t pinv;
27	27
28	28	sp_reduce_struct red_struct;
	29	sp_ll_reduce_struct ll_red_struct;
29	30
30	31	FFTPrimeInfo* p_info; // non-null means we are directly using
31	32	// an FFT prime

57	58	Vec<mulmod_precon_t> uqinv; // MulModPrecon for u
58	59	};
59	60
60		NTL_THREAD_LOCAL extern SmartPtr<zz_pInfoT> zz_pInfo; // current modulus, initially null
	61	extern
	62	NTL_CHEAP_THREAD_LOCAL
	63	zz_pInfoT *zz_pInfo;
	64	// current modulus, initially null
61	65
62	66
63	67	class zz_pContext {

122	126	explicit zz_pPush(const zz_pContext& context) { bak.save(); context.restore(); }
123	127
124	128	explicit zz_pPush(long p, long maxroot=NTL_FFTMaxRoot)
125		{ bak.save(); zz_pContext c(p); c.restore(); }
	129	{ bak.save(); zz_pContext c(p, maxroot); c.restore(); }
126	130
127	131	zz_pPush(INIT_FFT_TYPE, long index)
128	132	{ bak.save(); zz_pContext c(INIT_FFT, index); c.restore(); }

179	183	static long modulus() { return zz_pInfo->p; }
180	184	static zz_p zero() { return zz_p(); }
181	185	static mulmod_t ModulusInverse() { return zz_pInfo->pinv; }
	186	static sp_reduce_struct red_struct() { return zz_pInfo->red_struct; }
	187	static sp_ll_reduce_struct ll_red_struct() { return zz_pInfo->ll_red_struct; }
182	188	static long PrimeCnt() { return zz_pInfo->PrimeCnt; }
183	189
184	190

422	428
423	429
424	430	void conv(Vec<zz_p>& x, const Vec<ZZ>& a);
425		// explicit instantiation of more efficient version,
	431	void conv(Vec<zz_p>& x, const Vec<long>& a);
	432	// explicit instantiation of more efficient versions,
426	433	// defined in vec_lzz_p.c
427	434
428	435

441	448	/* ------------------------------------- */
442	449
443	450
	451	// *********************************************************
	452	// *** specialized inner-product routines, for internal consumption
	453	// *********************************************************
	454
	455	#ifdef NTL_HAVE_LL_TYPE
	456	long
	457	InnerProd_LL(const long *ap, const zz_p *bp, long n, long d,
	458	sp_ll_reduce_struct dinv);
	459
	460	long
	461	InnerProd_LL(const zz_p *ap, const zz_p *bp, long n, long d,
	462	sp_ll_reduce_struct dinv);
	463	#endif
	464
	465
	466	long
	467	InnerProd_L(const long *ap, const zz_p *bp, long n, long d,
	468	sp_reduce_struct dinv);
	469
	470	long
	471	InnerProd_L(const zz_p *ap, const zz_p *bp, long n, long d,
	472	sp_reduce_struct dinv);
	473
444	474
445	475	NTL_CLOSE_NNS
446	476

-2

include/NTL/lzz_pE.h less more

29	29
30	30	};
31	31
32		NTL_THREAD_LOCAL
33		extern SmartPtr<zz_pEInfoT> zz_pEInfo; // info for current modulus, initially null
	32	extern
	33	NTL_CHEAP_THREAD_LOCAL
	34	zz_pEInfoT *zz_pEInfo;
	35	// info for current modulus, initially null
	36	// fast TLS access
34	37
35	38
36	39

-1

include/NTL/lzz_pEX.h less more

842	842	vec_zz_pEX H;
843	843	};
844	844
845		NTL_THREAD_LOCAL extern long zz_pEXArgBound;
	845	extern
	846	NTL_CHEAP_THREAD_LOCAL
	847	long zz_pEXArgBound;
846	848
847	849
848	850	void build(zz_pEXArgument& H, const zz_pEX& h, const zz_pEXModulus& F, long m);

-2

include/NTL/lzz_pEXFactoring.h less more

35	35	// assumes that f is monic and splits into distinct linear factors
36	36
37	37
38		NTL_THREAD_LOCAL extern long zz_pEX_GCDTableSize; /* = 4 */
	38	extern
	39	NTL_CHEAP_THREAD_LOCAL
	40	long zz_pEX_GCDTableSize; /* = 4 */
39	41	// Controls GCD blocking for NewDDF
40	42
41	43
42		NTL_THREAD_LOCAL extern double zz_pEXFileThresh;
	44	extern
	45	NTL_CHEAP_THREAD_LOCAL
	46	double zz_pEXFileThresh;
43	47	// external files are used for baby/giant steps if size
44	48	// of these tables exceeds zz_pEXFileThresh KB.
45	49

+44

-1

include/NTL/lzz_pX.h less more

5	5	#include <NTL/lzz_p.h>
6	6	#include <NTL/vec_lzz_p.h>
7	7	#include <NTL/Lazy.h>
	8	#include <NTL/SmartPtr.h>
8	9
9	10	NTL_OPEN_NNS
10	11

1092	1093	vec_zz_pX H;
1093	1094	};
1094	1095
1095		NTL_THREAD_LOCAL extern long zz_pXArgBound;
	1096	extern
	1097	NTL_CHEAP_THREAD_LOCAL
	1098	long zz_pXArgBound;
1096	1099
1097	1100
1098	1101	void build(zz_pXArgument& H, const zz_pX& h, const zz_pXModulus& F, long m);

1105	1108	inline zz_pX
1106	1109	CompMod(const zz_pX& g, const zz_pXArgument& H, const zz_pXModulus& F)
1107	1110	{ zz_pX x; CompMod(x, g, H, F); NTL_OPT_RETURN(zz_pX, x); }
	1111
	1112
	1113
	1114	// experimental variant that yields a faster ModComp
	1115	// Usage:
	1116	// zz_pXArgument H;
	1117	// build(H, h, F);
	1118	// zz_pXAltArgument H1;
	1119	// build(H1, H, F); // this keeps a pointer to H, so H must remain alive
	1120	// CompMod(x, g, H1, F); // x = g(h) mod f
	1121
	1122	struct zz_pXAltArgument {
	1123
	1124	const zz_pXArgument *orig;
	1125	zz_pXAltArgument() : orig(0) {}
	1126
	1127	#ifdef NTL_HAVE_LL_TYPE
	1128	long strategy;
	1129
	1130	long n, m;
	1131	Vec< Vec<long> > mem;
	1132	Vec<long*> row;
	1133
	1134	// NOTE: the following two members are used only if
	1135	// NTL_HAVE_AVX; however, we declare them unconditionally
	1136	// to facilitate the possibility of dynamic linking based
	1137	// on architecture
	1138	Vec< AlignedArray<double> > dmem;
	1139	Vec<double*> drow;
	1140
	1141	sp_ll_reduce_struct pinv_LL;
	1142	sp_reduce_struct pinv_L;
	1143	#endif
	1144	};
	1145
	1146
	1147	void build(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F);
	1148	void CompMod(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
	1149	const zz_pXModulus& F);
	1150
1108	1151
1109	1152
1110	1153

-2

include/NTL/lzz_pXFactoring.h less more

73	73
74	74
75	75
76		NTL_THREAD_LOCAL extern long zz_pX_BlockingFactor;
	76	extern
	77	NTL_CHEAP_THREAD_LOCAL
	78	long zz_pX_BlockingFactor;
77	79	// Controls GCD blocking for DDF.
78	80
79	81

89	91	// Obsolete: see NewDDF, below.
90	92
91	93
92		NTL_THREAD_LOCAL extern long zz_pX_GCDTableSize; /* = 4 */
	94	extern
	95	NTL_CHEAP_THREAD_LOCAL
	96	long zz_pX_GCDTableSize; /* = 4 */
93	97	// Controls GCD blocking for NewDDF
94	98
95	99

-0

include/NTL/mat_GF2.h less more

51	51	long IsIdent(const mat_GF2& A, long n);
52	52	void transpose(mat_GF2& X, const mat_GF2& A);
53	53	void solve(ref_GF2 d, vec_GF2& X, const mat_GF2& A, const vec_GF2& b);
	54	void solve(ref_GF2 d, const mat_GF2& A, vec_GF2& x, const vec_GF2& b);
54	55	void inv(ref_GF2 d, mat_GF2& X, const mat_GF2& A);
55	56
56	57	inline void sqr(mat_GF2& X, const mat_GF2& A)

-2

include/NTL/mat_GF2E.h less more

37	37	void determinant(GF2E& d, const mat_GF2E& A);
38	38	long IsIdent(const mat_GF2E& A, long n);
39	39	void transpose(mat_GF2E& X, const mat_GF2E& A);
40		void solve(GF2E& d, vec_GF2E& X,
41		const mat_GF2E& A, const vec_GF2E& b);
	40	void solve(GF2E& d, vec_GF2E& x, const mat_GF2E& A, const vec_GF2E& b);
	41	void solve(GF2E& d, const mat_GF2E& A, vec_GF2E& x, const vec_GF2E& b);
42	42	void inv(GF2E& d, mat_GF2E& X, const mat_GF2E& A);
43	43
44	44	inline void sqr(mat_GF2E& X, const mat_GF2E& A)

-2

include/NTL/mat_ZZ_p.h less more

34	34	void determinant(ZZ_p& d, const mat_ZZ_p& A);
35	35	long IsIdent(const mat_ZZ_p& A, long n);
36	36	void transpose(mat_ZZ_p& X, const mat_ZZ_p& A);
37		void solve(ZZ_p& d, vec_ZZ_p& X,
38		const mat_ZZ_p& A, const vec_ZZ_p& b);
	37	void solve(ZZ_p& d, vec_ZZ_p& X, const mat_ZZ_p& A, const vec_ZZ_p& b);
	38	void solve(ZZ_p& d, const mat_ZZ_p& A, vec_ZZ_p& x, const vec_ZZ_p& b);
39	39	void inv(ZZ_p& d, mat_ZZ_p& X, const mat_ZZ_p& A);
40	40
41	41	inline void sqr(mat_ZZ_p& X, const mat_ZZ_p& A)

-2

include/NTL/mat_ZZ_pE.h less more

44	44	inline mat_ZZ_pE transpose(const mat_ZZ_pE& A)
45	45	{ mat_ZZ_pE X; transpose(X, A); NTL_OPT_RETURN(mat_ZZ_pE, X); }
46	46
47		void solve(ZZ_pE& d, vec_ZZ_pE& X,
48		const mat_ZZ_pE& A, const vec_ZZ_pE& b);
	47	void solve(ZZ_pE& d, vec_ZZ_pE& x, const mat_ZZ_pE& A, const vec_ZZ_pE& b);
	48	void solve(ZZ_pE& d, const mat_ZZ_pE& A, vec_ZZ_pE& x, const vec_ZZ_pE& b);
49	49
50	50	void inv(ZZ_pE& d, mat_ZZ_pE& X, const mat_ZZ_pE& A);
51	51

+65

-20

include/NTL/mat_lzz_p.h less more

30	30	inline mat_zz_p ident_mat_zz_p(long n)
31	31	{ mat_zz_p X; ident(X, n); NTL_OPT_RETURN(mat_zz_p, X); }
32	32
33		void determinant(zz_p& d, const mat_zz_p& A);
34	33	long IsIdent(const mat_zz_p& A, long n);
35	34	void transpose(mat_zz_p& X, const mat_zz_p& A);
36		void solve(zz_p& d, vec_zz_p& X,
37		const mat_zz_p& A, const vec_zz_p& b);
38		void inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A);
	35
	36
	37
	38	// ************************
	39
	40	void relaxed_solve(zz_p& d, vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b, bool relax=true);
	41	void relaxed_solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b, bool relax=true);
	42
	43	void relaxed_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax=true);
	44	inline void relaxed_inv(mat_zz_p& X, const mat_zz_p& A, bool relax=true)
	45	{ zz_p d; relaxed_inv(d, X, A, relax); if (d == 0) ArithmeticError("inv: non-invertible matrix"); }
	46	inline mat_zz_p relaxed_inv(const mat_zz_p& A, bool relax=true)
	47	{ mat_zz_p X; relaxed_inv(X, A, relax); NTL_OPT_RETURN(mat_zz_p, X); }
	48
	49	void relaxed_determinant(zz_p& d, const mat_zz_p& A, bool relax=true);
	50	inline zz_p relaxed_determinant(const mat_zz_p& a, bool relax=true)
	51	{ zz_p x; relaxed_determinant(x, a, relax); return x; }
	52
	53	void relaxed_power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e, bool relax=true);
	54	inline mat_zz_p relaxed_power(const mat_zz_p& A, const ZZ& e, bool relax=true)
	55	{ mat_zz_p X; relaxed_power(X, A, e, relax); NTL_OPT_RETURN(mat_zz_p, X); }
	56	inline void relaxed_power(mat_zz_p& X, const mat_zz_p& A, long e, bool relax=true)
	57	{ relaxed_power(X, A, ZZ_expo(e), relax); }
	58	inline mat_zz_p relaxed_power(const mat_zz_p& A, long e, bool relax=true)
	59	{ mat_zz_p X; relaxed_power(X, A, e, relax); NTL_OPT_RETURN(mat_zz_p, X); }
	60
	61	// ***********************
	62
	63	inline void solve(zz_p& d, vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
	64	{ relaxed_solve(d, x, A, b, false); }
	65
	66	inline void solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b)
	67	{ relaxed_solve(d, A, x, b, false); }
	68
	69	inline void inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A)
	70	{ relaxed_inv(d, X, A, false); }
	71
	72	inline void inv(mat_zz_p& X, const mat_zz_p& A)
	73	{ relaxed_inv(X, A, false); }
	74
	75	inline mat_zz_p inv(const mat_zz_p& A)
	76	{ return relaxed_inv(A, false); }
	77
	78	inline void determinant(zz_p& d, const mat_zz_p& A)
	79	{ relaxed_determinant(d, A, false); }
	80
	81	inline zz_p determinant(const mat_zz_p& a)
	82	{ return relaxed_determinant(a, false); }
	83
	84	inline void power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e)
	85	{ relaxed_power(X, A, e, false); }
	86
	87	inline mat_zz_p power(const mat_zz_p& A, const ZZ& e)
	88	{ return relaxed_power(A, e, false); }
	89
	90	inline void power(mat_zz_p& X, const mat_zz_p& A, long e)
	91	{ relaxed_power(X, A, e, false); }
	92
	93	inline mat_zz_p power(const mat_zz_p& A, long e)
	94	{ return relaxed_power(A, e, false); }
	95
	96	// ************************
	97
39	98
40	99	inline void sqr(mat_zz_p& X, const mat_zz_p& A)
41	100	{ mul(X, A, A); }

43	102	inline mat_zz_p sqr(const mat_zz_p& A)
44	103	{ mat_zz_p X; sqr(X, A); NTL_OPT_RETURN(mat_zz_p, X); }
45	104
46		void inv(mat_zz_p& X, const mat_zz_p& A);
47
48		inline mat_zz_p inv(const mat_zz_p& A)
49		{ mat_zz_p X; inv(X, A); NTL_OPT_RETURN(mat_zz_p, X); }
50
51		void power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e);
52		inline mat_zz_p power(const mat_zz_p& A, const ZZ& e)
53		{ mat_zz_p X; power(X, A, e); NTL_OPT_RETURN(mat_zz_p, X); }
54
55		inline void power(mat_zz_p& X, const mat_zz_p& A, long e)
56		{ power(X, A, ZZ_expo(e)); }
57		inline mat_zz_p power(const mat_zz_p& A, long e)
58		{ mat_zz_p X; power(X, A, e); NTL_OPT_RETURN(mat_zz_p, X); }
	105
	106
59	107
60	108
61	109	void diag(mat_zz_p& X, long n, zz_p d);

74	122
75	123	// miscellaneous:
76	124
77		inline zz_p determinant(const mat_zz_p& a)
78		{ zz_p x; determinant(x, a); return x; }
79		// functional variant of determinant
80	125
81	126	inline mat_zz_p transpose(const mat_zz_p& a)
82	127	{ mat_zz_p x; transpose(x, a); NTL_OPT_RETURN(mat_zz_p, x); }

-2

include/NTL/mat_lzz_pE.h less more

44	44	inline mat_zz_pE transpose(const mat_zz_pE& A)
45	45	{ mat_zz_pE X; transpose(X, A); NTL_OPT_RETURN(mat_zz_pE, X); }
46	46
47		void solve(zz_pE& d, vec_zz_pE& X,
48		const mat_zz_pE& A, const vec_zz_pE& b);
	47	void solve(zz_pE& d, vec_zz_pE& x, const mat_zz_pE& A, const vec_zz_pE& b);
	48	void solve(zz_pE& d, const mat_zz_pE& A, vec_zz_pE& x, const vec_zz_pE& b);
49	49
50	50	void inv(zz_pE& d, mat_zz_pE& X, const mat_zz_pE& A);
51	51

+11

-6

include/NTL/quad_float.h less more

65	65
66	66	inline quad_float& operator=(double x);
67	67
68		NTL_THREAD_LOCAL static long oprec;
	68	static
	69	NTL_CHEAP_THREAD_LOCAL
	70	long oprec;
69	71
70	72	static void SetOutputPrecision(long p);
71	73	static long OutputPrecision() { return oprec; }

77	79
78	80	}; // end class quad_float
79	81
	82
	83
	84
80	85	#if (NTL_BITS_PER_LONG < NTL_DOUBLE_PRECISION)
81	86
82	87	// FIXME: we could make this <=, and even BPL <= DP+1 for

87	92
88	93	#else
89	94
	95
90	96	quad_float to_quad_float(long n);
91	97	quad_float to_quad_float(unsigned long n);
92	98
93	99	#endif
	100
	101
94	102
95	103	#if (NTL_BITS_PER_INT < NTL_DOUBLE_PRECISION)
96	104

108	116
109	117
110	118
111		inline quad_float to_quad_float(double x) { return quad_float(x, 0); }
112		// On platforms with extended doubles, this may result in an
113		// improper quad_float object, but it should be converted to a proper
114		// one when passed by reference to any of the arithmetic routines,
115		// at which time x will be forced to memory.
	119
	120	inline quad_float to_quad_float(double x) { return quad_float(TrueDouble(x), 0); }
116	121
117	122	inline quad_float to_quad_float(float x)
118	123	{ return to_quad_float(double(x)); }

+162

-40

include/NTL/sp_arith.h less more

300	300
301	301	// **********************************************************************
302	302
	303
	304
	305
	306
	307	#ifdef NTL_HAVE_BUILTIN_CLZL
	308
	309	static inline long
	310	sp_CountLeadingZeros(unsigned long x)
	311	{
	312	return __builtin_clzl(x);
	313	}
	314
	315	#else
	316
	317	static inline long
	318	sp_CountLeadingZeros(unsigned long x)
	319	{
	320	long res = NTL_BITS_PER_LONG-NTL_SP_NBITS;
	321	x = x << NTL_BITS_PER_LONG-NTL_SP_NBITS;
	322	while (x < (1UL << (NTL_BITS_PER_LONG-1))) {
	323	x <<= 1;
	324	res++;
	325	}
	326
	327	return res;
	328	}
	329
	330
	331	#endif
303	332
304	333
305	334

627	656	#endif
628	657
629	658
630
631		#ifdef NTL_HAVE_BUILTIN_CLZL
632
633		static inline long
634		sp_CountLeadingZeros(unsigned long x)
635		{
636		return __builtin_clzl(x);
637		}
638
639		#else
640
641		static inline long
642		sp_CountLeadingZeros(unsigned long x)
643		{
644		long res = NTL_BITS_PER_LONG-NTL_SP_NBITS;
645		x = x << NTL_BITS_PER_LONG-NTL_SP_NBITS;
646		while (x < (1UL << (NTL_BITS_PER_LONG-1))) {
647		x <<= 1;
648		res++;
649		}
650
651		return res;
652		}
653
654
655		#endif
656
657
658
659	659	static inline sp_inverse
660	660	PrepMulMod(long n)
661	661	{

673	673	static inline long
674	674	sp_NormalizedMulMod(long a, long b, long n, unsigned long ninv)
675	675	{
676		NTL_ULL_TYPE U = ((NTL_ULL_TYPE) cast_unsigned(a)) * ((NTL_ULL_TYPE) cast_unsigned(b));
677		unsigned long H = U >> (NTL_SP_NBITS-2);
	676	ll_type U;
	677	ll_imul(U, a, b);
	678	unsigned long H = ll_rshift_get_lo<NTL_SP_NBITS-2>(U);
678	679	unsigned long Q = MulHiUL(H, ninv);
679	680	Q = Q >> NTL_POST_SHIFT;
680		unsigned long L = U;
	681	unsigned long L = ll_get_lo(U);
681	682	long r = L - Q*cast_unsigned(n); // r in [0..2*n)
682	683
683	684	r = sp_CorrectExcess(r, n);
684	685	return r;
685	686	}
	687
	688
686	689
687	690	static inline long
688	691	MulMod(long a, long b, long n, sp_inverse ninv)

708	711	static inline long
709	712	sp_NormalizedMulModWithQuo(long& qres, long a, long b, long n, unsigned long ninv)
710	713	{
711		NTL_ULL_TYPE U = ((NTL_ULL_TYPE) cast_unsigned(a)) * ((NTL_ULL_TYPE) cast_unsigned(b));
712		unsigned long H = U >> (NTL_SP_NBITS-2);
	714	ll_type U;
	715	ll_imul(U, a, b);
	716	unsigned long H = ll_rshift_get_lo<NTL_SP_NBITS-2>(U);
713	717	unsigned long Q = MulHiUL(H, ninv);
714	718	Q = Q >> NTL_POST_SHIFT;
715		unsigned long L = U;
	719	unsigned long L = ll_get_lo(U);
716	720	long r = L - Q*cast_unsigned(n); // r in [0..2*n)
717	721
718	722	r = sp_CorrectExcessQuo(Q, r, n);

753	757
754	758	#else
755	759
	760
756	761	static inline unsigned long
757	762	sp_NormalizedPrepMulModPrecon(long b, long n, unsigned long ninv)
758	763	{
759		NTL_ULL_TYPE U = ((NTL_ULL_TYPE) cast_unsigned(b)) << NTL_SP_NBITS;
760		unsigned long H = U >> (NTL_SP_NBITS-2);
	764	unsigned long H = cast_unsigned(b) << 2;
761	765	unsigned long Q = MulHiUL(H, ninv);
762	766	Q = Q >> NTL_POST_SHIFT;
763		unsigned long L = U;
	767	unsigned long L = cast_unsigned(b) << NTL_SP_NBITS;
764	768	long r = L - Q*cast_unsigned(n); // r in [0..2*n)
765	769
766	770
767	771	Q += 1L + sp_SignMask(r-n);
768	772	return Q; // NOTE: not shifted
769	773	}
	774
770	775
771	776	static inline unsigned long
772	777	PrepMulModPrecon(long b, long n, sp_inverse ninv)

1004	1009	#endif
1005	1010
1006	1011
1007
1008
	1012	#ifdef NTL_HAVE_LL_TYPE
	1013
	1014	#define NTL_HAVE_SP_LL_ROUTINES
	1015
	1016
	1017	// some routines that are currently not part of the documented
	1018	// interface. They currently are only defined when we have appropriate
	1019	// LL type.
	1020
	1021
	1022	struct sp_ll_reduce_struct {
	1023	unsigned long inv;
	1024	long nbits;
	1025
	1026	sp_ll_reduce_struct() { }
	1027
	1028	sp_ll_reduce_struct(unsigned long _inv, long _nbits) : inv(_inv), nbits(_nbits) { }
	1029
	1030	};
	1031
	1032
	1033	static inline sp_ll_reduce_struct
	1034	make_sp_ll_reduce_struct(long n)
	1035	{
	1036	long nbits = NTL_BITS_PER_LONG - sp_CountLeadingZeros(n);
	1037	unsigned long inv =
	1038	(unsigned long) ( ((((NTL_ULL_TYPE) 1) << (nbits+NTL_BITS_PER_LONG))-1UL) / ((NTL_ULL_TYPE) n) );
	1039
	1040	return sp_ll_reduce_struct(inv, nbits);
	1041	}
	1042
	1043
	1044	// computes remainder (hi, lo) mod d, assumes hi < d
	1045	static inline long
	1046	sp_ll_red_21(unsigned long hi, unsigned long lo, long d,
	1047	sp_ll_reduce_struct dinv)
	1048	{
	1049	unsigned long H =
	1050	(hi << (NTL_BITS_PER_LONG-dinv.nbits)) | (lo >> dinv.nbits);
	1051	unsigned long Q = MulHiUL(H, dinv.inv) + H;
	1052	unsigned long rr = lo - Q*cast_unsigned(d); // rr in [0..4*d)
	1053	long r = sp_CorrectExcess(rr, 2*d); // r in [0..2*d)
	1054	r = sp_CorrectExcess(r, d);
	1055	return r;
	1056	}
	1057
	1058	// computes remainder (x[n-1], ..., x[0]) mod d
	1059	static inline long
	1060	sp_ll_red_n1(const unsigned long *x, long n, long d, sp_ll_reduce_struct dinv)
	1061	{
	1062	long carry = 0;
	1063	long i;
	1064	for (i = n-1; i >= 0; i--)
	1065	carry = sp_ll_red_21(carry, x[i], d, dinv);
	1066	return carry;
	1067	}
	1068
	1069	// computes remainder (x2, x1, x0) mod d, assumes x2 < d
	1070	static inline long
	1071	sp_ll_red_31(unsigned long x2, unsigned long x1, unsigned long x0,
	1072	long d, sp_ll_reduce_struct dinv)
	1073	{
	1074	long carry = sp_ll_red_21(x2, x1, d, dinv);
	1075	return sp_ll_red_21(carry, x0, d, dinv);
	1076	}
	1077
	1078
	1079	// normalized versions of the above: assume NumBits(d) == NTL_SP_NBITS
	1080
	1081	// computes remainder (hi, lo) mod d, assumes hi < d
	1082	static inline long
	1083	sp_ll_red_21_normalized(unsigned long hi, unsigned long lo, long d,
	1084	sp_ll_reduce_struct dinv)
	1085	{
	1086	unsigned long H =
	1087	(hi << (NTL_BITS_PER_LONG-NTL_SP_NBITS)) | (lo >> NTL_SP_NBITS);
	1088	unsigned long Q = MulHiUL(H, dinv.inv) + H;
	1089	unsigned long rr = lo - Q*cast_unsigned(d); // rr in [0..4*d)
	1090	long r = sp_CorrectExcess(rr, 2*d); // r in [0..2*d)
	1091	r = sp_CorrectExcess(r, d);
	1092	return r;
	1093	}
	1094
	1095	// computes remainder (x[n-1], ..., x[0]) mod d
	1096	static inline long
	1097	sp_ll_red_n1_normalized(const unsigned long *x, long n, long d, sp_ll_reduce_struct dinv)
	1098	{
	1099	long carry = 0;
	1100	long i;
	1101	for (i = n-1; i >= 0; i--)
	1102	carry = sp_ll_red_21_normalized(carry, x[i], d, dinv);
	1103	return carry;
	1104	}
	1105
	1106	// computes remainder (x2, x1, x0) mod d, assumes x2 < d
	1107	static inline long
	1108	sp_ll_red_31_normalized(unsigned long x2, unsigned long x1, unsigned long x0,
	1109	long d, sp_ll_reduce_struct dinv)
	1110	{
	1111	long carry = sp_ll_red_21_normalized(x2, x1, d, dinv);
	1112	return sp_ll_red_21_normalized(carry, x0, d, dinv);
	1113	}
	1114
	1115
	1116	#else
	1117
	1118	// provided to streamline some code
	1119
	1120
	1121	struct sp_ll_reduce_struct { };
	1122
	1123
	1124	static inline sp_ll_reduce_struct
	1125	make_sp_ll_reduce_struct(long n)
	1126	{
	1127	return sp_ll_reduce_struct();
	1128	}
	1129
	1130	#endif
1009	1131
1010	1132
1011	1133	NTL_CLOSE_NNS

-2

include/NTL/thread.h less more

196	196	#endif
197	197
198	198
199
200
201	199	const NTL_SNS string& CurrentThreadID();
	200
	201
202	202
203	203
204	204	/*********************************************************************

+357

-10

include/NTL/tools.h less more

	0
	1
	2
0	3
1	4	#ifndef NTL_tools__H
2	5	#define NTL_tools__H

12	15
13	16	#include <cstdlib>
14	17	#include <cmath>
	18
	19
	20
	21	#if (defined(NTL_THREADS) && defined(__GNUC__) && !defined(NTL_DISABLE_TLS_HACK))
	22	#define NTL_TLS_HACK
	23	#endif
	24
	25
	26
	27	#ifdef NTL_TLS_HACK
	28	#include <pthread.h>
	29	#endif
15	30
16	31
17	32

124	139
125	140
126	141
127		#define NTL_FILE_THRESH (128000.0)
128		// threshold in KB for switching to external storage of certain
129		// tables (currently in the DDF polynomial factoring routines)
	142	#define NTL_FILE_THRESH (1e12)
	143	// threshold in KB for switching to external storage of certain tables
130	144
131	145
132	146

191	205	inline long min(long a, int b) { return (a < b) ? a : long(b); }
192	206	inline long max(long a, int b) { return (a < b) ? long(b) : a; }
193	207
194		#endif
195
196
	208	inline unsigned int min(unsigned int a, unsigned int b)
	209	{ return (a < b) ? a : b; }
	210	inline unsigned int max(unsigned int a, unsigned int b)
	211	{ return (a < b) ? b : a; }
	212
	213	inline unsigned long min(unsigned long a, unsigned long b)
	214	{ return (a < b) ? a : b; }
	215	inline unsigned long max(unsigned long a, unsigned long b)
	216	{ return (a < b) ? b : a; }
	217
	218	inline unsigned long min(unsigned int a, unsigned long b)
	219	{ return (a < b) ? (unsigned long)(a) : b; }
	220	inline unsigned long max(unsigned int a, unsigned long b)
	221	{ return (a < b) ? b : (unsigned long)(a); }
	222
	223	inline unsigned long min(unsigned long a, unsigned int b)
	224	{ return (a < b) ? a : (unsigned long)(b); }
	225	inline unsigned long max(unsigned long a, unsigned int b)
	226	{ return (a < b) ? (unsigned long)(b) : a; }
	227
	228	#endif
	229
	230
	231	// NOTE: these are here for historical reasons, so I'll leave them
	232	// Since it is likely to lead to ambiguities with std::swap,
	233	// I am not defining any more of these.
197	234	inline void swap(long& a, long& b) { long t; t = a; a = b; b = t; }
198	235	inline void swap(int& a, int& b) { int t; t = a; a = b; b = t; }
199	236

384	421	inline void ForceToMem(double *p) { }
385	422
386	423	#endif
	424
	425
	426	inline double TrueDouble(double x)
	427	{
	428	ForceToMem(&x);
	429	return x;
	430	}
	431
387	432
388	433
389	434

485	530
486	531
487	532
488		NTL_THREAD_LOCAL extern void (*ErrorCallback)();
489
490		NTL_THREAD_LOCAL extern void (*ErrorMsgCallback)(const char *);
	533	extern NTL_CHEAP_THREAD_LOCAL void (*ErrorCallback)();
	534
	535	extern NTL_CHEAP_THREAD_LOCAL void (*ErrorMsgCallback)(const char *);
491	536
492	537
493	538	void TerminalError(const char *s);

551	596
552	597	struct scope_guard_builder {
553	598	const char *info;
554		scope_guard_builder(const char *_info) : info(_info) { }
	599	explicit scope_guard_builder(const char *_info) : info(_info) { }
555	600	};
556	601
557	602	template < typename F >

584	629
585	630
586	631
	632
	633	#ifdef NTL_TLS_HACK
	634
	635
	636	namespace details_pthread {
	637
	638
	639	template<class T> void do_delete_aux(T* t) noexcept { delete t; }
	640	// an exception here would likely lead to a complete mess...
	641	// the noexcept specification should force an immediate termination
	642
	643	template<class T> void do_delete(void* t) { do_delete_aux((T*)t); }
	644
	645	using namespace std;
	646	// I'm not sure if pthread stuff might be placed in namespace std
	647
	648	struct key_wrapper {
	649	pthread_key_t key;
	650
	651	key_wrapper(void (*destructor)(void*))
	652	{
	653	if (pthread_key_create(&key, destructor))
	654	ResourceError("pthread_key_create failed");
	655	}
	656
	657	template<class T>
	658	T* set(T *p)
	659	{
	660	if (!p) MemoryError();
	661	if (pthread_setspecific(key, p)) {
	662	do_delete_aux(p);
	663	ResourceError("pthread_setspecific failed");
	664	}
	665	return p;
	666	}
	667
	668	};
	669
	670	}
	671
	672
	673	#define NTL_TLS_LOCAL_INIT(type, var, init) \
	674	static NTL_CHEAP_THREAD_LOCAL type *_ntl_hidden_variable_tls_local_ptr_ ## var = 0; \
	675	type *_ntl_hidden_variable_tls_local_ptr1_ ## var = _ntl_hidden_variable_tls_local_ptr_ ## var; \
	676	if (!_ntl_hidden_variable_tls_local_ptr1_ ## var) { \
	677	static details_pthread::key_wrapper hidden_variable_key(details_pthread::do_delete<type>); \
	678	type *_ntl_hidden_variable_tls_local_ptr2_ ## var = hidden_variable_key.set(NTL_NEW_OP type init); \
	679	_ntl_hidden_variable_tls_local_ptr1_ ## var = _ntl_hidden_variable_tls_local_ptr2_ ## var; \
	680	_ntl_hidden_variable_tls_local_ptr_ ## var = _ntl_hidden_variable_tls_local_ptr1_ ## var; \
	681	} \
	682	type &var = *_ntl_hidden_variable_tls_local_ptr1_ ## var \
	683
	684
	685
	686	#else
	687
	688
	689	// NOTE: this definition of NTL_TLS_LOCAL_INIT ensures that var names
	690	// a local reference, regardless of the implementation
	691	#define NTL_TLS_LOCAL_INIT(type,var,init) \
	692	static NTL_THREAD_LOCAL type _ntl_hidden_variable_tls_local ## var init; \
	693	type &var = _ntl_hidden_variable_tls_local ## var
	694
	695
	696
	697
	698	#endif
	699
	700	#define NTL_EMPTY_ARG
	701	#define NTL_TLS_LOCAL(type,var) NTL_TLS_LOCAL_INIT(type,var,NTL_EMPTY_ARG)
	702
	703	#define NTL_TLS_GLOBAL_DECL_INIT(type,var,init) \
	704	typedef type _ntl_hidden_typedef_tls_access_ ## var; \
	705	static inline \
	706	type& _ntl_hidden_function_tls_access_ ## var() { \
	707	NTL_TLS_LOCAL_INIT(type,var,init); \
	708	return var; \
	709	} \
	710
	711
	712	#define NTL_TLS_GLOBAL_DECL(type,var) NTL_TLS_GLOBAL_DECL_INIT(type,var,NTL_EMPTY_ARG)
	713
	714	#define NTL_TLS_GLOBAL_ACCESS(var) \
	715	_ntl_hidden_typedef_tls_access_ ## var & var = _ntl_hidden_function_tls_access_ ## var()
	716
	717
	718	// **************************************************************
	719	// Following is code for "long long" arithmetic that can
	720	// be implemented using NTL_ULL_TYPE or using assembly.
	721	// I have found that the assembly can be a bit faster.
	722	// For now, this code is only available if NTL_HAVE_LL_TYPE
	723	// is defined. This could change. In any case, this provides
	724	// a cleaner interface and might eventually allow for
	725	// implementation on systems that don't provide a long long type.
	726	// **************************************************************
	727
	728	#ifdef NTL_HAVE_LL_TYPE
	729
	730
	731	#if (!defined(NTL_DISABLE_LL_ASM) \
	732	&& defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__INTEL_COMPILER) && !defined(__clang__) \
	733	&& defined (__x86_64__) && NTL_BITS_PER_LONG == 64)
	734
	735	// NOTE: clang's and icc's inline asm code gen is pretty bad, so
	736	// we don't even try.
	737
	738	// FIXME: probably, this should all be properly tested for speed (and correctness)
	739	// using the Wizard.
	740
	741
		// Two-word accumulator used by the inline-assembly ll_* routines in this
		// branch: `hi` and `lo` are its high and low unsigned long halves.
	742	struct ll_type {
	743	unsigned long hi, lo;
	744	};
	745
	746
		// x += a*b, using the full two-word unsigned product.
		// `a` is tied to the register of the `lo` output ("=a") through the matching
		// constraint "%[lo]"; mulq leaves the product in the "=d":"=a" pair (hi:lo),
		// and addq/adcq then add lo into x.lo and hi (plus carry) into x.hi.
		// The condition flags are declared clobbered ("cc").
	747	static inline void
	748	ll_mul_add(ll_type& x, unsigned long a, unsigned long b)
	749	{
	750	unsigned long hi, lo;
	751	__asm__ (
	752	"mulq %[b] \n\t"
	753	"addq %[lo],%[xlo] \n\t"
	754	"adcq %[hi],%[xhi]" :
	755	[lo] "=a" (lo), [hi] "=d" (hi), [xhi] "+r" (x.hi), [xlo] "+r" (x.lo) :
	756	[a] "%[lo]" (a), [b] "rm" (b) :
	757	"cc"
	758	);
	759	}
	760
		// Same shape as ll_mul_add, but the product is formed with the one-operand
		// signed multiply (imulq) before being added into x.lo/x.hi with carry.
	761	static inline void
	762	ll_imul_add(ll_type& x, unsigned long a, unsigned long b)
	763	{
	764	unsigned long hi, lo;
	765	__asm__ (
	766	"imulq %[b] \n\t"
	767	"addq %[lo],%[xlo] \n\t"
	768	"adcq %[hi],%[xhi]" :
	769	[lo] "=a" (lo), [hi] "=d" (hi), [xhi] "+r" (x.hi), [xlo] "+r" (x.lo) :
	770	[a] "%[lo]" (a), [b] "rm" (b) :
	771	"cc"
	772	);
	773	}
	774
		// x = a*b (unsigned): mulq writes the two-word product straight into
		// x.lo ("=a") and x.hi ("=d"); `a` shares the register of the lo output.
	775	static inline void
	776	ll_mul(ll_type& x, unsigned long a, unsigned long b)
	777	{
	778	__asm__ (
	779	"mulq %[b]" :
	780	[lo] "=a" (x.lo), [hi] "=d" (x.hi) :
	781	[a] "%[lo]" (a), [b] "rm" (b) :
	782	"cc"
	783	);
	784	}
	785
		// x = a*b using the one-operand signed multiply (imulq); the two-word
		// result is written to x.lo ("=a") and x.hi ("=d").
	786	static inline void
	787	ll_imul(ll_type& x, unsigned long a, unsigned long b)
	788	{
	789	__asm__ (
	790	"imulq %[b]" :
	791	[lo] "=a" (x.lo), [hi] "=d" (x.hi) :
	792	[a] "%[lo]" (a), [b] "rm" (b) :
	793	"cc"
	794	);
	795	}
	796
		// x += a: adds the single word `a` into x.lo and propagates the carry into
		// x.hi by adding the immediate 0 with carry (adcq).
	797	static inline void
	798	ll_add(ll_type& x, unsigned long a)
	799	{
	800	__asm__ (
	801	"addq %[a],%[xlo] \n\t"
	802	"adcq %[z],%[xhi]" :
	803	[xhi] "+r" (x.hi), [xlo] "+r" (x.lo) :
	804	[a] "rm" (a), [z] "i" (0) :
	805	"cc"
	806	);
	807	}
	808
	809
	810
	811	// NOTE: an optimizing compiler will remove the conditional.
	812	// The alternative would be to make a specialization for shamt=0.
	813	// Unfortunately, this is impossible to do across a wide range
	814	// of compilers and still maintain internal linkage --- it is not
	815	// allowed to include static spec in the specialization (new compilers
	816	// will complain) and without it, some older compilers will generate
	817	// an external symbol. In fact, NTL currently never calls
	818	// this with shamt=0, so it is all rather academic...but I want to
	819	// keep this general for future use.
		// Returns the low word of (x >> shamt) for a compile-time shift amount.
		// x is taken by value, so the shrdq (shift lo right, filling from hi) only
		// modifies the local copy.  The shift is skipped entirely when shamt == 0.
	820	template<long shamt>
	821	static inline unsigned long
	822	ll_rshift_get_lo(ll_type x)
	823	{
	824	if (shamt) {
	825	__asm__ (
	826	"shrdq %[shamt],%[hi],%[lo]" :
	827	[lo] "+r" (x.lo) :
	828	[shamt] "i" (shamt), [hi] "r" (x.hi) :
	829	"cc"
	830	);
	831	}
	832	return x.lo;
	833	}
	834
	835
		// Returns the low word of x.
	836	static inline unsigned long
	837	ll_get_lo(const ll_type& x)
	838	{
	839	return x.lo;
	840	}
	841
		// Returns the high word of x.
	842	static inline unsigned long
	843	ll_get_hi(const ll_type& x)
	844	{
	845	return x.hi;
	846	}
	847
	848
		// Initializes x to the single-word value a (low word = a, high word = 0).
	849	static inline void
	850	ll_init(ll_type& x, unsigned long a)
	851	{
	852	x.lo = a;
	853	x.hi = 0;
	854	}
	855
	856	#else
	857
	858
		// Portable branch: ll_type is simply the NTL_ULL_TYPE integer type.
	859	typedef NTL_ULL_TYPE ll_type;
	860
	861	// NOTE: the following function definitions should serve as
	862	// documentation, as well.
	863
		// x += a*b, with both operands widened to ll_type before multiplying.
	864	static inline void
	865	ll_mul_add(ll_type& x, unsigned long a, unsigned long b)
	866	{
	867	x += ((ll_type) a)*((ll_type) b);
	868	}
	869
	870	// a and b should be representable as positive long's,
	871	// to allow for the most flexible implementation
		// x += a*b, where each operand is first converted to long and then
		// widened to ll_type.
	872	static inline void
	873	ll_imul_add(ll_type& x, unsigned long a, unsigned long b)
	874	{
	875	x += ((ll_type) long(a))*((ll_type) long(b));
	876	}
		// x = a*b, with both operands widened to ll_type before multiplying.
	877	static inline void
	878	ll_mul(ll_type& x, unsigned long a, unsigned long b)
	879	{
	880	x = ((ll_type) a)*((ll_type) b);
	881	}
	882
	883	// a and b should be representable as positive long's,
	884	// to allow for the most flexible implementation
		// x = a*b, where each operand is first converted to long and then
		// widened to ll_type.
	885	static inline void
	886	ll_imul(ll_type& x, unsigned long a, unsigned long b)
	887	{
	888	x = ((ll_type) long(a))*((ll_type) long(b));
	889	}
	890
		// x += a (single-word addend).
	891	static inline void
	892	ll_add(ll_type& x, unsigned long a)
	893	{
	894	x += a;
	895	}
	896
		// Returns (x >> shamt) truncated to unsigned long, for a compile-time shamt.
	897	template<long shamt>
	898	static inline unsigned long
	899	ll_rshift_get_lo(const ll_type& x)
	900	{
	901	return ((unsigned long) (x >> shamt));
	902	}
	903
		// Returns x truncated to unsigned long (its low word).
	904	static inline unsigned long
	905	ll_get_lo(const ll_type& x)
	906	{
	907	return ((unsigned long) x);
	908	}
	909
		// Returns x shifted right by NTL_BITS_PER_LONG bits, truncated to
		// unsigned long (its high word).
	910	static inline unsigned long
	911	ll_get_hi(const ll_type& x)
	912	{
	913	return ((unsigned long) (x >> NTL_BITS_PER_LONG));
	914	}
	915
	916
		// Initializes x to the single-word value a.
	917	static inline void
	918	ll_init(ll_type& x, unsigned long a)
	919	{
	920	x = a;
	921	}
	922
	923
	924	#endif
	925
	926
	927
	928	#endif
	929
	930
	931
	932
	933
587	934	NTL_CLOSE_NNS
588	935
589	936

-0

include/NTL/vector.h less more

58	58	{
59	59	for (long i = 0; i < n; i++)
60	60	p[i].~T();
	61
	62	// NOTE: this routine is only invoked through a Vec destructor
	63	// or a scope guard destructor, both of which are noexcept destructors.
	64	// therefore, if ~T() should throw, the program will terminate
61	65	}
62	66
63	67

-2

include/NTL/version.h less more

1	1	#ifndef NTL_version__H
2	2	#define NTL_version__H
3	3
4		#define NTL_VERSION "9.3.0"
	4	#define NTL_VERSION "9.9.0"
5	5
6	6	#define NTL_MAJOR_VERSION (9)
7		#define NTL_MINOR_VERSION (3)
	7	#define NTL_MINOR_VERSION (9)
8	8	#define NTL_REVISION (0)
9	9
10	10	#endif

-1

include/NTL/xdouble.h less more

41	41
42	42	void normalize();
43	43
44		NTL_THREAD_LOCAL static long oprec;
	44	static
	45	NTL_CHEAP_THREAD_LOCAL
	46	long oprec;
45	47
46	48	static void SetOutputPrecision(long p);
47	49	static long OutputPrecision() { return oprec; }

+32

-0

src/BasicThreadPool.c less more

	0
	1	#include <NTL/BasicThreadPool.h>
	2
	3	#ifdef NTL_THREAD_BOOST
	4
	5	NTL_START_IMPL
	6
	7
	8	NTL_TLS_GLOBAL_DECL(UniquePtr<BasicThreadPool>, NTLThreadPool_stg)
	9
	10	NTL_CHEAP_THREAD_LOCAL BasicThreadPool *NTLThreadPool_ptr = 0;
	11
		// Installs `pool` as the current thread's thread pool: the thread-local
		// UniquePtr NTLThreadPool_stg is reset to it, and the raw pointer
		// NTLThreadPool_ptr is updated to match.
		// NOTE(review): presumably the UniquePtr takes ownership of `pool` and
		// disposes of any previously held pool -- confirm against UniquePtr::reset.
	12	void ResetThreadPool(BasicThreadPool *pool)
	13	{
	14	NTL_TLS_GLOBAL_ACCESS(NTLThreadPool_stg);
	15	NTLThreadPool_stg.reset(pool);
	16	NTLThreadPool_ptr = pool;
	17	}
	18
		// Detaches the current thread's pool: returns whatever
		// NTLThreadPool_stg.release() yields and clears NTLThreadPool_ptr.
		// NOTE(review): presumably the caller becomes responsible for the
		// returned pool -- confirm against UniquePtr::release.
	19	BasicThreadPool *ReleaseThreadPool()
	20	{
	21	NTL_TLS_GLOBAL_ACCESS(NTLThreadPool_stg);
	22	BasicThreadPool *pool = NTLThreadPool_stg.release();
	23	NTLThreadPool_ptr = 0;
	24	return pool;
	25	}
	26
	27
	28
	29	NTL_END_IMPL
	30
	31	#endif

-6

src/BitMatTest.c less more

76	76	random(a, n, m);
77	77
78	78	t = GetTime();
79		kernel(x, a);
	79	image(x, a);
80	80	t = GetTime() - t; cerr << t << "\n";
81	81
82	82	cvt(A, a);
83	83
84	84	t = GetTime();
85		kernel(X, A);
	85	image(X, A);
86	86	t = GetTime() - t; cerr << t << "\n";
87
88		cerr << x.NumRows() << "\n";
89	87
90	88	cvt(X1, x);
91	89
92	90	if (X1 != X) TerminalError("BitMatTest NOT OK!!");
93
94		if (!IsZero(X*A)) TerminalError("BitMatTest NOT OK!!");
95	91
96	92	cerr << "\n";
97	93	}

+63

-0

src/CheckAVX.c less more

	0	#include <NTL/ctools.h>
	1
	2	#include <cstdlib>
	3	#include <immintrin.h>
	4	#include <iostream>
	5
	6
	7	#if (!defined(__GNUC__) \|\| !defined(__x86_64__) \|\| !defined(__AVX__))
	8	#error "AVX not supported"
	9	#endif
	10
	11	#if (NTL_BITS_PER_LONG != 64 \|\| NTL_DOUBLE_PRECISION != 53)
	12	#error "AVX not supported"
	13	// sanity check -- code that uses this feature also relies on this
	14	#endif
	15
	16	using namespace std;
	17
	18	void fun(double * x, const double a, const double b)
	19	{
	20	__m256d xvec, avec, bvec, cvec;
	21
	22	avec = _mm256_load_pd(a);
	23	bvec = _mm256_load_pd(b);
	24	xvec = _mm256_load_pd(x);
	25
	26	xvec = _mm256_add_pd(_mm256_mul_pd(avec, bvec), xvec);
	27
	28	_mm256_store_pd(x, xvec);
	29	}
	30	int main()
	31	{
	32	NTL_AVX_LOCAL_ARRAY(vp, double, 12);
	33
	34	double a = vp + 04;
	35	double b = vp + 14;
	36	double x = vp + 24;
	37
	38	a[0] = atoi("1");
	39	a[1] = atoi("2");
	40	a[2] = atoi("3");
	41	a[3] = atoi("4");
	42
	43	b[0] = atoi("2");
	44	b[1] = atoi("3");
	45	b[2] = atoi("4");
	46	b[3] = atoi("5");
	47
	48	x[0] = atoi("3");
	49	x[1] = atoi("4");
	50	x[2] = atoi("5");
	51	x[3] = atoi("6");
	52
	53	fun(x, a, b);
	54
	55	if (x[0] == 5 && x[1] == 10 && x[2] == 17 && x[3] == 26)
	56	return 0;
	57	else
	58	return -1;
	59	}
	60
	61
	62

-0

src/CheckCompile.c less more

	0
	1	int main() { return 0; }

+65

-0

src/CheckFMA.c less more

	0
	1	#include <NTL/ctools.h>
	2
	3	#include <cstdlib>
	4	#include <immintrin.h>
	5	#include <iostream>
	6
	7
	8	#if (!defined(__GNUC__) \|\| !defined(__x86_64__) \|\| !defined(__AVX2__))
	9	#error "AVX2 with FMA not supported"
	10	#endif
	11
	12	#if (NTL_BITS_PER_LONG != 64 \|\| NTL_DOUBLE_PRECISION != 53)
	13	#error "AVX2 with FMA not supported"
	14	// sanity check -- code that uses this feature also relies on this
	15	#endif
	16
	17	using namespace std;
	18
	19
	20	void fun(double * x, const double a, const double b)
	21	{
	22	__m256d xvec, avec, bvec, cvec;
	23
	24	avec = _mm256_load_pd(a);
	25	bvec = _mm256_load_pd(b);
	26	xvec = _mm256_load_pd(x);
	27
	28	xvec = _mm256_fmadd_pd(avec, bvec, xvec);
	29
	30	_mm256_store_pd(x, xvec);
	31	}
	32	int main()
	33	{
	34	NTL_AVX_LOCAL_ARRAY(vp, double, 12);
	35
	36	double a = vp + 04;
	37	double b = vp + 14;
	38	double x = vp + 24;
	39
	40	a[0] = atoi("1");
	41	a[1] = atoi("2");
	42	a[2] = atoi("3");
	43	a[3] = atoi("4");
	44
	45	b[0] = atoi("2");
	46	b[1] = atoi("3");
	47	b[2] = atoi("4");
	48	b[3] = atoi("5");
	49
	50	x[0] = atoi("3");
	51	x[1] = atoi("4");
	52	x[2] = atoi("5");
	53	x[3] = atoi("6");
	54
	55	fun(x, a, b);
	56
	57	if (x[0] == 5 && x[1] == 10 && x[2] == 17 && x[3] == 26)
	58	return 0;
	59	else
	60	return -1;
	61	}
	62
	63
	64

+10

-0

src/CheckLL.c less more

	0
	1
	2	#define NTL_HAVE_LL_TYPE
	3	// DIRT: we need to define this here so that ctools.h
	4	// does not undefine the LL type macros
	5
0	6	#include <NTL/ctools.h>
	7
	8	#ifdef NTL_DISABLE_LONGLONG
	9	#error "LL_TYPE disabled"
	10	#endif
1	11
2	12	void touch(unsigned long& x);
3	13	void touch(long& x);

-54

~~src/CheckPCLMUL.c~~ less more

0		#include <iostream>
1		#include <wmmintrin.h>
2
3
4		using namespace std;
5
6
7
		// Carry-less multiply of two words: a and b are each placed in the low 64
		// bits of a 128-bit vector (high half zero), multiplied with
		// _mm_clmulepi64_si128 (selector 0), and the 128-bit result is stored,
		// unaligned, to c[0..1].
8		void
9		pclmul_mul1 (unsigned long *c, unsigned long a, unsigned long b)
10		{
11		__m128i aa = _mm_setr_epi64( _mm_cvtsi64_m64(a), _mm_cvtsi64_m64(0));
12		__m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0));
13		_mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0));
14		}
15
16
		// Runtime check for PCLMUL: prints a banner, verifies that unsigned long is
		// 64 bits wide, then checks that the carry-less product 3 * 3 comes out as
		// {5, 0}.  Prints "good" and returns 0 on success; otherwise prints a
		// "bad" message and returns 1.
17		int main()
18		{
19		cout << "Running CheckPCLMUL...";
20
21		// make sure longs are 64 bit
22		// this runs before mach_desc.h is built, so we calculate
23		// bits-per-long here...in not quite as paranoid a fashion
24		// as in MakeDesc.c. On any standard-compliant compiler,
25		// it should be correct.
26
27		unsigned long ulval = 1;
28		long bpl = 0;
29
		// count how many left shifts it takes to push the single set bit out
30		while (ulval) {
31		ulval <<= 1;
32		bpl++;
33		}
34
35		if (bpl != 64) {
36		cout << "bad (only works with 64-bit longs)\n";
37		return 1;
38		}
39
40		unsigned long c[2], a, b;
41		a = 3;
42		b = 3;
43		pclmul_mul1(c, a, b);
44		if (c[0] == 5 && c[1] == 0) {
45		cout << "good\n";
46		return 0;
47		}
48		else {
49		cout << "bad\n";
50		return 1;
51		}
52		}
53

-0

src/CopyFeatures less more

	0
	1	cp "$1/include/NTL/HAVE_LL_TYPE.h" "$2/include/NTL/HAVE_LL_TYPE.h"
	2	cp "$1/include/NTL/HAVE_BUILTIN_CLZL.h" "$2/include/NTL/HAVE_BUILTIN_CLZL.h"
	3	cp "$1/include/NTL/HAVE_AVX.h" "$2/include/NTL/HAVE_AVX.h"
	4	cp "$1/include/NTL/HAVE_FMA.h" "$2/include/NTL/HAVE_FMA.h"

-1

src/DIRNAME less more

0		ntl-9.3.0
	0	ntl-9.9.0

+34

-4

src/DispSettings.c less more

12	12	{
13	13
14	14	cout << "\n\n";
	15	cout << "/***************************\n";
15	16	cout << "Basic Configuration Options:\n";
16	17
17	18

29	30	cout << "NTL_THREADS\n";
30	31	#endif
31	32
	33	#ifdef NTL_DISABLE_TLS_HACK
	34	cout << "NTL_DISABLE_TLS_HACK\n";
	35	#endif
	36
	37	#ifdef NTL_ENABLE_TLS_HACK
	38	cout << "NTL_ENABLE_TLS_HACK\n";
	39	#endif
	40
32	41	#ifdef NTL_EXCEPTIONS
33	42	cout << "NTL_EXCEPTIONS\n";
	43	#endif
	44
	45	#ifdef NTL_THREAD_BOOST
	46	cout << "NTL_THREAD_BOOST\n";
34	47	#endif
35	48
36	49

41	54
42	55	#ifdef NTL_GF2X_LIB
43	56	cout << "NTL_GF2X_LIB\n";
44		#endif
45
46		#ifdef NTL_PCLMUL
47		cout << "NTL_PCLMUL\n";
48	57	#endif
49	58
50	59	#ifdef NTL_LONG_LONG_TYPE

96	105	cout << "NTL_DISABLE_LONGLONG\n";
97	106	#endif
98	107
	108	#ifdef NTL_DISABLE_LL_ASM
	109	cout << "NTL_DISABLE_LL_ASM\n";
	110	#endif
	111
	112	#ifdef NTL_MAXIMIZE_SP_NBITS
	113	cout << "NTL_MAXIMIZE_SP_NBITS\n";
	114	#endif
99	115
100	116
101	117	cout << "\n";

147	163	cout << "NTL_TBL_REM_LL\n";
148	164	#endif
149	165
	166	#ifdef NTL_CRT_ALTCODE
	167	cout << "NTL_CRT_ALTCODE\n";
	168	#endif
	169
	170	#ifdef NTL_CRT_ALTCODE_SMALL
	171	cout << "NTL_CRT_ALTCODE_SMALL\n";
	172	#endif
	173
150	174
151	175	#ifdef NTL_GF2X_ALTCODE
152	176	cout << "NTL_GF2X_ALTCODE\n";

162	186	cout << "NTL_GF2X_NOINLINE\n";
163	187	#endif
164	188
	189	#ifdef NTL_PCLMUL
	190	cout << "NTL_PCLMUL\n";
	191	#endif
	192
	193
	194	cout << "***************************/\n";
165	195	cout << "\n\n";
166	196
167	197	return 0;

+202

-127

src/DoConfig less more

2	2	# use warnings; # this doesn't work on older versions of perl
3	3
4	4
		# Deletes the program file $name and its "$name.exe" variant.
5		sub RemoveProg {
6
7		# This should work on unix and cygwin on windows
8
9		my ($name) = @_;
10
11		unlink($name); unlink("$name.exe");
12
13		}
14
		# Reports that PCLMUL is unusable, writes 'NTL_PCLMUL=off' into the file
		# RETRY_CONFIG, and exits with status 1.
15		sub BadPCLMUL {
16		print "\n\nPCLMUL does not work on this system\n";
17		print "reconfiguring with NTL_PCLMUL=off...\n\n\n";
18		system("echo 'NTL_PCLMUL=off' > RETRY_CONFIG");
19		exit 1;
20		}
21	5
22	6	%MakeFlag = (
23	7
24	8	'WIZARD' => 'on',
25	9	'SHARED' => 'off',
	10	'NATIVE' => 'on'
26	11
27	12	);
28	13

30	15
31	16	'CXX' => 'g++',
32	17	'CXXFLAGS' => '-g -O2',
	18	'CXXAUTOFLAGS'=> '',
33	19	'AR' => 'ar',
34	20	'ARFLAGS' => 'ruv',
35	21	'RANLIB' => 'ranlib',

61	47
62	48	%ConfigFlag = (
63	49
64		'NTL_LEGACY_NO_NAMESPACE' => 'off',
65		'NTL_LEGACY_INPUT_ERROR' => 'off',
	50	'NTL_LEGACY_NO_NAMESPACE' => 'off',
	51	'NTL_LEGACY_INPUT_ERROR' => 'off',
66	52	'NTL_DISABLE_LONGDOUBLE' => 'off',
67		'NTL_DISABLE_LONGLONG' => 'off',
68		'NTL_LEGACY_SP_MULMOD' => 'off',
69		'NTL_THREADS' => 'off',
70		'NTL_EXCEPTIONS' => 'off',
71		'NTL_GMP_LIP' => 'off',
72		'NTL_GF2X_LIB' => 'off',
73		'NTL_PCLMUL' => 'off',
74		'NTL_X86_FIX' => 'off',
75		'NTL_NO_X86_FIX' => 'off',
76		'NTL_AVOID_FLOAT' => 'off',
77		'NTL_LONG_LONG' => 'off',
78		'NTL_SPMM_ULL' => 'off',
79		'NTL_SPMM_ASM' => 'off',
80		'NTL_AVOID_BRANCHING' => 'off',
81		'NTL_TBL_REM' => 'off',
82		'NTL_TBL_REM_LL' => 'off',
83		'NTL_GF2X_NOINLINE' => 'off',
84		'NTL_GF2X_ALTCODE' => 'off',
85		'NTL_GF2X_ALTCODE1' => 'off',
86		'NTL_NO_INIT_TRANS' => 'off',
87		'NTL_CLEAN_INT' => 'off',
88		'NTL_CLEAN_PTR' => 'off',
89		'NTL_RANGE_CHECK' => 'off',
90		'NTL_FFT_BIGTAB' => 'off',
91		'NTL_FFT_LAZYMUL' => 'off',
	53	'NTL_DISABLE_LONGLONG' => 'off',
	54	'NTL_DISABLE_LL_ASM' => 'off',
	55	'NTL_MAXIMIZE_SP_NBITS' => 'off',
	56	'NTL_LEGACY_SP_MULMOD' => 'off',
	57	'NTL_THREADS' => 'off',
	58	'NTL_DISABLE_TLS_HACK' => 'off',
	59	'NTL_ENABLE_TLS_HACK' => 'off',
	60	'NTL_EXCEPTIONS' => 'off',
	61	'NTL_THREAD_BOOST' => 'off',
	62	'NTL_GMP_LIP' => 'on',
	63	'NTL_GF2X_LIB' => 'off',
	64	'NTL_X86_FIX' => 'off',
	65	'NTL_NO_X86_FIX' => 'off',
	66	'NTL_AVOID_FLOAT' => 'off',
	67	'NTL_LONG_LONG' => 'off',
	68	'NTL_SPMM_ULL' => 'off',
	69	'NTL_SPMM_ASM' => 'off',
	70	'NTL_AVOID_BRANCHING' => 'off',
	71	'NTL_TBL_REM' => 'off',
	72	'NTL_TBL_REM_LL' => 'off',
	73	'NTL_CRT_ALTCODE' => 'off',
	74	'NTL_CRT_ALTCODE_SMALL' => 'off',
	75	'NTL_GF2X_NOINLINE' => 'off',
	76	'NTL_GF2X_ALTCODE' => 'off',
	77	'NTL_GF2X_ALTCODE1' => 'off',
	78	'NTL_PCLMUL' => 'off',
	79	'NTL_NO_INIT_TRANS' => 'off',
	80	'NTL_CLEAN_INT' => 'off',
	81	'NTL_CLEAN_PTR' => 'off',
	82	'NTL_RANGE_CHECK' => 'off',
	83	'NTL_FFT_BIGTAB' => 'off',
	84	'NTL_FFT_LAZYMUL' => 'off',
92	85
93	86	);
94	87

109	102
110	103	if ($arg =~ '^(-h\|help\|-help\|--help)$') {
111	104	system("more ../doc/config.txt");
112		exit;
	105	exit 0;
113	106	}
114	107
115	108	if ($arg =~ '^--nowrite$') {

144	137
145	138	}
146	139
147		# special processing for NTL_THREADS: if this is set, we override
148		# the default setting for CXXFLAGS
149
150		if ($ConfigFlag{'NTL_THREADS'} eq 'on' && !exists($Variable{'CXXFLAGS'})) {
151		$MakeVal{'CXXFLAGS'} = $MakeVal{'CXXFLAGS'} . ' -std=c++11 -pthread';
152		}
153
154		# special processing for NTL_EXCEPTIONS: similar to processing
155		# for NTL_THREADS
156
157		if ($ConfigFlag{'NTL_EXCEPTIONS'} eq 'on' && $ConfigFlag{'NTL_THREADS'} eq 'off' && !exists($Variable{'CXXFLAGS'})) {
158		$MakeVal{'CXXFLAGS'} = $MakeVal{'CXXFLAGS'} . ' -std=c++11';
159		}
160
161
162		# special processing for NTL_PCLMUL: if set, add -mpclmul to CXXFLAGS
163
164		if ($ConfigFlag{'NTL_PCLMUL'} eq 'on' && !exists($Variable{'CXXFLAGS'})) {
165		$MakeVal{'CXXFLAGS'} = $MakeVal{'CXXFLAGS'} . ' -mpclmul';
166		}
	140	# special processing: NTL_THREAD_BOOST => NTL_THREADS
	141
	142	if ($ConfigFlag{'NTL_THREAD_BOOST'} eq 'on') {
	143	$ConfigFlag{'NTL_THREADS'} = 'on';
	144	}
	145
	146	if ($ConfigFlag{'NTL_THREADS'} eq 'on' && $ConfigFlag{'NTL_GMP_LIP'} eq 'off') {
	147	die "Error: NTL_THREADS currently only available with NTL_GMP_LIP...sorry\n";
	148	}
	149
167	150
168	151
169	152	# some special MakeVal values that are determined by SHARED

194	177	$MakeVal{'GMPL'} = '# ';
195	178	$MakeVal{'GMP'} = '# ';
196	179
197		if ($ConfigFlag{'NTL_GMP_LIP'} eq 'on' \|\| $ConfigFlag{'NTL_GMP_HACK'} eq 'on') {
	180	if ($ConfigFlag{'NTL_GMP_LIP'} eq 'on') {
198	181	$MakeVal{'GMP'} = '';
199	182	if (exists($Variable{'DEF_PREFIX'}) \|\|
200	183	exists($Variable{'GMP_PREFIX'}) \|\|

296	279
297	280	}
298	281
299		if ($ConfigSub{'NTL_GMP_HACK'} + $ConfigSub{'NTL_GMP_LIP'} > 1) {
300
301		die "Error: at most one of NTL_GMP_HACK and NTL_GMP_LIP may be on\n";
302
303		}
304	282
305	283	if ($ConfigSub{'NTL_AVOID_FLOAT'} + $ConfigSub{'NTL_LONG_LONG'} > 1) {
306	284

324	302
325	303
326	304
327		######################################
328
329		# all tests pass -- generate files
330
331		######################################
332
333		# generate makefile
334
335		open(MFILE, "< mfile");
336		open(MFILEOUT, "> mfileout");
337
338		while ($line = <MFILE>) {
339
340		$line =~ s/@\{(.*?)\}/$MakeSub{$1}/ge;
341
342		print MFILEOUT $line;
343
344		}
345
346		close(MFILE);
347		close(MFILEOUT);
348
349
350		# generate config.h
351
352		open(CFILE, "< cfile");
353		open(CFILEOUT, "> cfileout");
354
355		while ($line = <CFILE>) {
356
357		$line =~ s/@\{(.*?)\}/$ConfigSub{$1}/ge;
358
359		print CFILEOUT $line;
360
361		}
362
363		close(CFILE);
364		close(CFILEOUT);
365
366		print("CXXFLAGS=\"$MakeVal{'CXXFLAGS'}\"\n");
367
368		if ($nowrite == 0) {
369
370		print("writing makefile\n");
	305
	306	#
	307	#
	308	#code to set CXXAUTOFLAGS
	309
		# Deletes the program file $name and its "$name.exe" variant.
		# Always returns 1, so it can be chained with `and`.
	310	sub RemoveProg {
	311	# This should work on unix and cygwin on windows
	312
	313	my ($name) = @_;
	314	unlink($name); unlink("$name.exe");
	315	return 1;
	316	}
	317
		# Generates the three intermediate output files in the current directory:
		#   mfileout - copy of mfile with every @{NAME} replaced by $MakeSub{NAME}
		#   cfileout - copy of cfile with every @{NAME} replaced by $ConfigSub{NAME}
		#   hfileout - two comment lines recording the configure arguments and the
		#              current CXXAUTOFLAGS value
		# Always returns 1.  The results of open() are not checked.
	318	sub GenFiles {
	319
	320	open(MFILE, "< mfile");
	321	open(MFILEOUT, "> mfileout");
	322
	323	while ($line = <MFILE>) {
	324
	325	$line =~ s/@\{(.*?)\}/$MakeSub{$1}/ge;
	326
	327	print MFILEOUT $line;
	328
	329	}
	330
	331	close(MFILE);
	332	close(MFILEOUT);
	333
	334
	335	# generate config.h
	336
	337
	338	open(CFILE, "< cfile");
	339	open(CFILEOUT, "> cfileout");
	340
	341	while ($line = <CFILE>) {
	342
	343	$line =~ s/@\{(.*?)\}/$ConfigSub{$1}/ge;
	344
	345	print CFILEOUT $line;
	346
	347	}
	348
	349	close(CFILE);
	350	close(CFILEOUT);
	351
	352	open(HFILEOUT, "> hfileout");
	353	$argstr = join(' ', @ARGV);
	354	print HFILEOUT "// generated by ./configure $argstr\n";
	355	print HFILEOUT "// CXXAUTOFLAGS=\"$MakeSub{'CXXAUTOFLAGS'}\" \n";
	356	close(HFILEOUT);
	357
	358
	359	return 1;
	360	}
	361
		# Installs the files produced by GenFiles: mfileout -> makefile,
		# cfileout -> ../include/NTL/config.h, hfileout -> ../include/NTL/config_log.h.
		# Always returns 1; the exit status of each cp is not checked.
	362	sub CopyFiles {
	363
371	364	system("cp mfileout makefile");
372
373		print("writing ../include/NTL/config.h\n");
374	365	system("cp cfileout ../include/NTL/config.h");
375
376		if ($ConfigFlag{'NTL_PCLMUL'} eq 'on') {
377		print "\n\nNTL_PCLMUL=on => checking system compatibility...\n\n";
378		RemoveProg("CheckPCLMUL");
379		system("make CheckPCLMUL") and BadPCLMUL();
380		system("./CheckPCLMUL") and BadPCLMUL();
381		print "\n\nPCLMUL works on this system\n";
382		}
383
384		}
385
	366	system("cp hfileout ../include/NTL/config_log.h");
	367
	368	return 1;
	369	}
	370
		# Regenerates and installs the makefile/config headers, then builds and runs
		# the trivial CheckCompile program with the current flags.
		# Returns 1 if both the build and the run succeed, 0 otherwise.  Build
		# output is appended to CheckFlag.log.  system() yields a non-zero (true)
		# value on failure, which is what triggers the `and return 0` paths; since
		# RemoveProg always returns 1, the failing-run path also cleans up first.
	371	sub CheckCompile {
	372	GenFiles();
	373	CopyFiles();
	374	RemoveProg("CheckCompile");
	375	system("make CheckCompile >> CheckFlag.log 2>&1") and return 0;
	376	system("./CheckCompile") and RemoveProg("CheckCompile") and return 0;
	377	RemoveProg("CheckCompile");
	378	return 1;
	379	}
	380
		# Tentatively appends $flag to $MakeSub{'CXXAUTOFLAGS'} and tests it with
		# CheckCompile.  The flag is kept if the check succeeds; otherwise the
		# previous value (saved in $try_flags) is restored.  Always returns 1.
	381	sub CheckFlag {
	382	my ($flag) = @_;
	383	my $try_flags = $MakeSub{'CXXAUTOFLAGS'};
	384	print "*** checking $flag flag\n";
	385	$MakeSub{'CXXAUTOFLAGS'} = $MakeSub{'CXXAUTOFLAGS'} . ' ' . $flag;
	386	print("CXXAUTOFLAGS=\"$MakeSub{'CXXAUTOFLAGS'}\"\n");
	387	if (CheckCompile()) {
	388	print "*** $flag works\n";
	389	}
	390	else {
	391	$MakeSub{'CXXAUTOFLAGS'} = $try_flags;
	392	print "*** $flag does not work\n";
	393	}
	394	return 1;
	395	}
	396
	397
	398
		# --nowrite: only generate the *out files, install nothing.
	399	if ($nowrite) {
	400	GenFiles();
	401	exit 0;
	402	}
	403
	404
		# CXXAUTOFLAGS given explicitly: use it as-is and skip all flag probing.
	405	if (exists($Variable{'CXXAUTOFLAGS'})) {
	406	print("CXXAUTOFLAGS=\"$MakeSub{'CXXAUTOFLAGS'}\"\n");
	407	GenFiles();
	408	CopyFiles();
	409	exit 0;
	410	}
	411
		# Otherwise decide which compiler flags should be probed.
	412	$std_flag = 0;
	413	$pthread_flag = 0;
	414	$native_flag = 0;
	415
	416	# special processing for NTL_THREADS
	417
	418	if ($ConfigFlag{'NTL_THREADS'} eq 'on') {
	419	$std_flag = 1; # ' -std=c++11';
	420	$pthread_flag = 1; # ' -pthread';
	421	}
	422
	423	# special processing for NTL_EXCEPTIONS
	424
	425	if ($ConfigFlag{'NTL_EXCEPTIONS'} eq 'on') {
	426	$std_flag = 1; # ' -std=c++11';
	427	}
	428
	429	# special processing for NATIVE
	430
	431	if ($MakeFlag{'NATIVE'} eq 'on') {
	432	$native_flag = 1; # ' -march=native';
	433	}
	434
	435
		# Start a fresh log, then probe each requested flag in turn; CheckFlag
		# keeps a flag in CXXAUTOFLAGS only if a test program builds and runs.
	436	system("echo '* CheckFlag log *' > CheckFlag.log");
	437
	438	if ($std_flag) {
	439	CheckFlag('-std=c++11');
	440	}
	441
	442	if ($pthread_flag) {
	443	CheckFlag('-pthread');
	444	}
	445
	446	if ($native_flag) {
	447	CheckFlag('-march=native');
	448	}
	449
		# Final generation pass with whatever flags survived the probing.
	450	print("CXXAUTOFLAGS=\"$MakeSub{'CXXAUTOFLAGS'}\"\n");
	451	print("generating makefile\n");
	452	print("generating ../include/NTL/config.h\n");
	453	print("generating ../include/NTL/config_log.h\n");
	454
	455	GenFiles();
	456	CopyFiles();
	457	exit 0;
	458
	459
	460

+60

-23

src/FFT.c less more

249	249
250	250
251	251
	252	// #define NTL_BRC_TEST
	253	// Flag to test the cost of "bit reverse copy"
252	254
253	255
254	256	#define NTL_FFT_BIGTAB_LIMIT (200)
	257	#ifndef NTL_BRC_TEST
255	258	#define NTL_FFT_BIGTAB_MAXROOT (17)
	259	#else
	260	#define NTL_FFT_BIGTAB_MAXROOT (25)
	261	#endif
256	262	// big tables are only used for the first NTL_FFT_BIGTAB_LIMIT primes,
257	263	// and then only for k-values at most NTL_FFT_BIGTAB_MAXROOT
258	264

538	544	}
539	545
540	546
541
542		NTL_THREAD_LOCAL static
543		Vec<long> brc_mem[NTL_FFTMaxRoot+1];
544	547	// FIXME: This could potentially be shared across threads, using
545	548	// a "lazy table".
		// Returns a pointer to the elements of a thread-local Vec< Vec<long> >
		// named brc_mem_vec, declared via NTL_TLS_LOCAL_INIT with the constructor
		// arguments (INIT_SIZE, NTL_FFTMaxRoot+1).
		// NOTE(review): presumably this yields NTL_FFTMaxRoot+1 entries, matching
		// the static array it replaces -- confirm against the Vec constructor.
	549	static inline
	550	Vec<long> *get_brc_mem()
	551	{
	552	NTL_TLS_LOCAL_INIT(Vec< Vec<long> >, brc_mem_vec, (INIT_SIZE, NTL_FFTMaxRoot+1));
	553	return brc_mem_vec.elts();
	554	}
	555
546	556
547	557
548	558	#if 0

551	561	static
552	562	void BitReverseCopy(long * NTL_RESTRICT A, const long * NTL_RESTRICT a, long k)
553	563	{
	564	Vec<long> *brc_mem = get_brc_mem();
	565
554	566	long n = 1L << k;
555	567	long* NTL_RESTRICT rev;
556	568	long i, j;

571	583	static
572	584	void BitReverseCopy(unsigned long * NTL_RESTRICT A, const long * NTL_RESTRICT a, long k)
573	585	{
	586	Vec<long> *brc_mem = get_brc_mem();
	587
574	588	long n = 1L << k;
575	589	long* NTL_RESTRICT rev;
576	590	long i, j;

602	616	static
603	617	long *BRC_init(long k)
604	618	{
	619	Vec<long> *brc_mem = get_brc_mem();
	620
605	621	long n = (1L << k);
606	622	brc_mem[k].SetLength(n);
607	623	long *rev = brc_mem[k].elts();

616	632	void BasicBitReverseCopy(long * NTL_RESTRICT B,
617	633	const long * NTL_RESTRICT A, long k)
618	634	{
	635	Vec<long> *brc_mem = get_brc_mem();
	636
619	637	long n = 1L << k;
620	638	long* NTL_RESTRICT rev;
621	639	long i, j;

632	650	static
633	651	void COBRA(long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
634	652	{
635		NTL_THREAD_LOCAL static Vec<long> BRC_temp;
	653	Vec<long> *brc_mem = get_brc_mem();
	654
	655	NTL_TLS_LOCAL(Vec<long>, BRC_temp);
636	656
637	657	long q = NTL_BRC_Q;
638	658	long k1 = k - 2*q;

683	703	void BasicBitReverseCopy(unsigned long * NTL_RESTRICT B,
684	704	const long * NTL_RESTRICT A, long k)
685	705	{
	706	Vec<long> *brc_mem = get_brc_mem();
	707
686	708	long n = 1L << k;
687	709	long* NTL_RESTRICT rev;
688	710	long i, j;

699	721	static
700	722	void COBRA(unsigned long * NTL_RESTRICT B, const long * NTL_RESTRICT A, long k)
701	723	{
702		NTL_THREAD_LOCAL static Vec<unsigned long> BRC_temp;
	724	Vec<long> *brc_mem = get_brc_mem();
	725
	726	NTL_TLS_LOCAL(Vec<unsigned long>, BRC_temp);
703	727
704	728	long q = NTL_BRC_Q;
705	729	long k1 = k - 2*q;

807	831
808	832	// assume k > 1
809	833
810		NTL_THREAD_LOCAL static Vec<long> wtab_store;
811		NTL_THREAD_LOCAL static Vec<mulmod_precon_t> wqinvtab_store;
812		NTL_THREAD_LOCAL static Vec<long> AA_store;
	834	NTL_TLS_LOCAL(Vec<long>, wtab_store);
	835	NTL_TLS_LOCAL(Vec<mulmod_precon_t>, wqinvtab_store);
	836	NTL_TLS_LOCAL(Vec<long>, AA_store);
813	837
814	838	wtab_store.SetLength(1L << (k-2));
815	839	wqinvtab_store.SetLength(1L << (k-2));

1041	1065	static inline unsigned long
1042	1066	sp_NormalizedLazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, unsigned long ninv)
1043	1067	{
1044		NTL_ULL_TYPE U = ((NTL_ULL_TYPE) cast_unsigned(b)) << (NTL_SP_NBITS+2);
1045		unsigned long H = ((U << (NTL_BITS_PER_LONG-NTL_SP_NBITS-2)) >> NTL_BITS_PER_LONG);
	1068	unsigned long H = cast_unsigned(b);
1046	1069	unsigned long Q = MulHiUL(H << 4, ninv);
1047		unsigned long L = U;
	1070	unsigned long L = cast_unsigned(b) << (NTL_SP_NBITS+2);
1048	1071	long r = L - Qcast_unsigned(n); // r in [0..2n)
1049	1072
1050	1073	r = sp_CorrectExcessQuo(Q, r, n);

1055	1078	static inline unsigned long
1056	1079	sp_NormalizedLazyPrepMulModPrecon(long b, long n, unsigned long ninv)
1057	1080	{
1058		NTL_ULL_TYPE U = ((NTL_ULL_TYPE) cast_unsigned(b)) << (NTL_SP_NBITS+2);
1059		unsigned long H = ((U << (NTL_BITS_PER_LONG-NTL_SP_NBITS-2)) >> NTL_BITS_PER_LONG);
	1081	unsigned long H = cast_unsigned(b);
1060	1082	unsigned long Q = MulHiUL(H << 4, ninv);
1061		unsigned long L = U;
	1083	unsigned long L = cast_unsigned(b) << (NTL_SP_NBITS+2);
1062	1084	long r = L - Qcast_unsigned(n); // r in [0..2n)
1063	1085
1064	1086	Q += 1L + sp_SignMask(r-n);
1065	1087	return Q; // NOTE: not shifted
1066	1088	}
	1089
1067	1090
1068	1091	#else
1069	1092

1257	1280
1258	1281
1259	1282
	1283	// FFT: Lazy, no tables
1260	1284
1261	1285	void NTL_FFT_ROUTINE_NOTAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
1262	1286

1283	1307
1284	1308	// assume k >= 2
1285	1309
1286		NTL_THREAD_LOCAL static Vec<unsigned long> AA_store;
	1310	NTL_TLS_LOCAL(Vec<unsigned long>, AA_store);
1287	1311	AA_store.SetLength(1L << k);
1288	1312	unsigned long *AA = AA_store.elts();
1289	1313
1290		NTL_THREAD_LOCAL static Vec<long> wtab_store;
	1314	NTL_TLS_LOCAL(Vec<long>, wtab_store);
1291	1315	wtab_store.SetLength(max(2, 1L << (k-2)));
1292	1316	// allocate space for at least 2 elements, to deal with a corner case when k == 2
1293	1317	long * NTL_RESTRICT wtab = wtab_store.elts();
1294	1318
1295		NTL_THREAD_LOCAL static Vec<mulmod_precon_t> wqinvtab_store;
	1319	NTL_TLS_LOCAL(Vec<mulmod_precon_t>, wqinvtab_store);
1296	1320	wqinvtab_store.SetLength(max(2, 1L << (k-2)));
1297	1321	// allocate space for at least 2 elements, to deal with a corner case when k == 2
1298	1322	mulmod_precon_t * NTL_RESTRICT wqinvtab = wqinvtab_store.elts();

1683	1707	}
1684	1708
1685	1709
	1710	// FFT: no lazy, table
1686	1711
1687	1712	void NTL_FFT_ROUTINE_TAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
1688	1713	// performs a 2^k-point convolution modulo q

1718	1743
1719	1744	if (k >= tab.length()) PrecompFFTMultipliers(k, q, qinv, root, tab);
1720	1745
1721		NTL_THREAD_LOCAL static Vec<long> AA_store;
	1746	NTL_TLS_LOCAL(Vec<long>, AA_store);
1722	1747	AA_store.SetLength(1L << k);
1723	1748	long *AA = AA_store.elts();
1724	1749

2023	2048
2024	2049
2025	2050
2026
	2051	#ifdef NTL_BRC_TEST
	2052	bool BRC_test_flag = false;
	2053	#endif
	2054
	2055
	2056	// FFT: lazy, tables
2027	2057
2028	2058	void NTL_FFT_ROUTINE_TAB(long* A, const long* a, long k, const FFTPrimeInfo& info, long dir)
2029	2059

2058	2088
2059	2089	if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2060	2090
2061		NTL_THREAD_LOCAL static Vec<unsigned long> AA_store;
	2091	NTL_TLS_LOCAL(Vec<unsigned long>, AA_store);
2062	2092	AA_store.SetLength(1L << k);
2063	2093	unsigned long *AA = AA_store.elts();
2064	2094
2065	2095
2066
	2096	long n = 1L << k;
	2097
	2098	#ifndef NTL_BRC_TEST
2067	2099	BitReverseCopy(AA, a, k);
2068
2069		long n = 1L << k;
	2100	#else
	2101	if (BRC_test_flag)
	2102	for (long i = 0; i < n; i++) AA[i] = a[i];
	2103	else
	2104	BitReverseCopy(AA, a, k);
	2105	#endif
	2106
2070	2107
2071	2108
2072	2109	/* we work with redundant representations, in the range [0, 4q) */

+11

-6

src/GF2E.c less more

4	4	#include <NTL/new.h>
5	5
6	6	NTL_START_IMPL
	7
	8	NTL_TLS_GLOBAL_DECL(SmartPtr<GF2EInfoT>, GF2EInfo_stg)
	9
	10	NTL_CHEAP_THREAD_LOCAL
	11	GF2EInfoT *GF2EInfo = 0;
7	12
8	13
9	14	GF2EInfoT::GF2EInfoT(const GF2X& NewP)

79	84
80	85
81	86
82		NTL_THREAD_LOCAL
83		SmartPtr<GF2EInfoT> GF2EInfo = 0;
84
85	87
86	88
87	89

94	96
		// Saves the current modulus context: copies the thread-local smart pointer
		// GF2EInfo_stg into this context's ptr.
95	97	void GF2EContext::save()
96	98	{
97		ptr = GF2EInfo;
	99	NTL_TLS_GLOBAL_ACCESS(GF2EInfo_stg);
	100	ptr = GF2EInfo_stg;
98	101	}
99	102
		// Makes this context current: assigns ptr to the thread-local smart pointer
		// GF2EInfo_stg and refreshes the raw pointer GF2EInfo from it.
100	103	void GF2EContext::restore() const
101	104	{
102		GF2EInfo = ptr;
	105	NTL_TLS_GLOBAL_ACCESS(GF2EInfo_stg);
	106	GF2EInfo_stg = ptr;
	107	GF2EInfo = GF2EInfo_stg.get();
103	108	}
104	109
105	110

126	131
		// Returns a reference to a function-local static const GF2E constructed
		// with INIT_NO_ALLOC; the new version is a plain static rather than a
		// thread-local one.
127	132	const GF2E& GF2E::zero()
128	133	{
129		NTL_THREAD_LOCAL static GF2E z(INIT_NO_ALLOC);
	134	static const GF2E z(INIT_NO_ALLOC); // GLOBAL (assumes C++11 thread-safe init)
130	135	return z;
131	136	}
132	137

-2

src/GF2EX.c less more

11	11
		// Returns a reference to a function-local static const, default-constructed
		// GF2EX; the new version is a plain static rather than a thread-local one.
12	12	const GF2EX& GF2EX::zero()
13	13	{
14		NTL_THREAD_LOCAL static GF2EX z;
	14	static const GF2EX z; // GLOBAL (assumes C++11 thread-safe init)
15	15	return z;
16	16	}
17	17

2460	2460
2461	2461
2462	2462
2463		NTL_THREAD_LOCAL
	2463	NTL_CHEAP_THREAD_LOCAL
2464	2464	long GF2EXArgBound = 0;
2465	2465
2466	2466

-6

src/GF2EXFactoring.c less more

841	841	}
842	842
843	843
844		NTL_THREAD_LOCAL
	844	NTL_CHEAP_THREAD_LOCAL
845	845	long GF2EX_BlockingFactor = 10;
846	846
847	847	void DDF(vec_pair_GF2EX_long& factors, const GF2EX& ff, const GF2EX& hh,

1629	1629
1630	1630	/*********** NEW DDF **************/
1631	1631
1632		NTL_THREAD_LOCAL long GF2EX_GCDTableSize = 4;
1633		NTL_THREAD_LOCAL double GF2EXFileThresh = NTL_FILE_THRESH;
1634		NTL_THREAD_LOCAL static vec_GF2EX *BabyStepFile = 0;
1635		NTL_THREAD_LOCAL static vec_GF2EX *GiantStepFile = 0;
1636		NTL_THREAD_LOCAL static long use_files;
	1632	NTL_CHEAP_THREAD_LOCAL long GF2EX_GCDTableSize = 4;
	1633	NTL_CHEAP_THREAD_LOCAL double GF2EXFileThresh = NTL_FILE_THRESH;
	1634	static NTL_CHEAP_THREAD_LOCAL vec_GF2EX *BabyStepFile = 0;
	1635	static NTL_CHEAP_THREAD_LOCAL vec_GF2EX *GiantStepFile = 0;
	1636	static NTL_CHEAP_THREAD_LOCAL long use_files;
1637	1637
1638	1638
1639	1639	static

-6

src/GF2X.c less more

39	39
40	40	NTL_START_IMPL
41	41
42		NTL_THREAD_LOCAL
	42	NTL_CHEAP_THREAD_LOCAL
43	43	long GF2X::HexOutput = 0;
44	44
45	45

61	61
		// Returns a reference to a function-local static const, default-constructed
		// GF2X; the new version is a plain static rather than a thread-local one.
62	62	const GF2X& GF2X::zero()
63	63	{
64		NTL_THREAD_LOCAL static GF2X z;
	64	static const GF2X z; // GLOBAL (assumes C++11 thread-safe init)
65	65	return z;
66	66	}
67	67

1040	1040	// finally: the general case
1041	1041
1042	1042
1043		NTL_THREAD_LOCAL static WordVector mem;
	1043	NTL_TLS_LOCAL(WordVector, mem);
1044	1044	WordVectorWatcher watch_mem(mem);
1045	1045
1046	1046	const _ntl_ulong ap = a.xrep.elts(), bp = b.xrep.elts();

1406	1406	// finally: the general case
1407	1407
1408	1408
1409		NTL_THREAD_LOCAL static WordVector mem;
1410		NTL_THREAD_LOCAL static WordVector stk;
1411		NTL_THREAD_LOCAL static WordVector vec;
	1409	NTL_TLS_LOCAL(WordVector, mem);
	1410	NTL_TLS_LOCAL(WordVector, stk);
	1411	NTL_TLS_LOCAL(WordVector, vec);
1412	1412
1413	1413	WordVectorWatcher watch_mem(mem);
1414	1414	WordVectorWatcher watch_stk(stk);

+47

-40

src/GF2X1.c less more

26	26
27	27
28	28
29		#define NTL_GF2X_GCD_CROSSOVER (XOVER_SCALE400LNTL_BITS_PER_LONG)
	29	#define NTL_GF2X_GCD_CROSSOVER (XOVER_SCALE300LNTL_BITS_PER_LONG)
	30
30	31	#define NTL_GF2X_BERMASS_CROSSOVER (XOVER_SCALE200LNTL_BITS_PER_LONG)
31	32
32		#define NTL_GF2X_HalfGCD_CROSSOVER (6L*NTL_BITS_PER_LONG)
	33	#define NTL_GF2X_HalfGCD_CROSSOVER (4L*NTL_BITS_PER_LONG)
33	34
34	35
35	36

37	38
38	39
39	40
40		NTL_THREAD_LOCAL
41		static vec_GF2X stab; // used by PlainDivRem and PlainRem
42
43		NTL_THREAD_LOCAL
44		static WordVector GF2X_rembuf;
	41	NTL_TLS_GLOBAL_DECL(vec_GF2X, stab)
	42	// used by PlainDivRem and PlainRem
	43
	44	NTL_TLS_GLOBAL_DECL(WordVector, GF2X_rembuf)
45	45
46	46
47	47	void PlainDivRem(GF2X& q, GF2X& r, const GF2X& a, const GF2X& b)
48	48	{
	49	NTL_TLS_GLOBAL_ACCESS(stab);
	50
49	51	long da, sa, posa, db, sb, posb, dq, sq, posq;
50	52
51	53	da = deg(a);

68	70	sq = dq/NTL_BITS_PER_LONG + 1;
69	71	posq = dq - NTL_BITS_PER_LONG*(sq-1);
70	72
	73	NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
71	74	WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
72	75
73	76	_ntl_ulong *ap;

164	167
165	168	void PlainRem(GF2X& r, const GF2X& a, const GF2X& b)
166	169	{
	170	NTL_TLS_GLOBAL_ACCESS(stab);
	171
167	172	long da, sa, posa, db, sb, posb;
168	173
169	174	da = deg(a);

182	187	posb = db - NTL_BITS_PER_LONG*(sb-1);
183	188
184	189
	190	NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
185	191	WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
186	192
187	193	_ntl_ulong *ap;

281	287	return;
282	288	}
283	289
284		NTL_THREAD_LOCAL static vec_long E;
	290	NTL_TLS_LOCAL(vec_long, E);
285	291	E.SetLength(0);
286	292	append(E, e);
287	293	while (e > 8) {

462	468
463	469	GF2X f0;
464	470	trunc(f0, f, n);
465		long deg_f0 = deg(f0);
466
467		if (F.sn > 1 && deg_f0 < NTL_BITS_PER_LONG
468		&& deg_f0 >= NTL_BITS_PER_LONG/2) {
469		if (F.size >= 3*XOVER_SCALE)
470		F.method = GF2X_MOD_MUL;
471		else
472		F.method = GF2X_MOD_SPECIAL;
473		}
474		else if (F.sn > 1 && deg_f0 < NTL_BITS_PER_LONG/2) {
475		if (F.size >= 2*XOVER_SCALE)
476		F.method = GF2X_MOD_MUL;
477		else
478		F.method = GF2X_MOD_SPECIAL;
479		}
480		else if (F.size >= 4*XOVER_SCALE)
	471
	472	if (F.n >= (NTL_BITS_PER_LONG/2)*XOVER_SCALE)
481	473	F.method = GF2X_MOD_MUL;
482	474	else
483	475	F.method = GF2X_MOD_PLAIN;
	476
	477
	478	// NOTE: I've run some tests which indicate that the GF2X_MOD_SPECIAL
	479	// method is not worth it.
	480	// FIXME: in a future version, I should eliminate all code
	481	// and data associated with GF2X_MOD_SPECIAL
	482
	483	// NOTE: I've run some tests which indicate that the crossover
	484	// for GF2X_MOD_MUL is extremely low, even without PCLMUL support.
484	485
485	486
486	487	if (F.method == GF2X_MOD_SPECIAL) {

1280	1281	UseMulRemX1(r, a, F);
1281	1282	}
1282	1283	else if (F.method == GF2X_MOD_SPECIAL) {
	1284	NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
1283	1285	WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
1284	1286
1285	1287	long sa = a.xrep.length();

1327	1329	r.normalize();
1328	1330	}
1329	1331	else {
	1332	NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
1330	1333	WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
1331	1334
1332	1335	long sa = a.xrep.length();

1404	1407	UseMulDivRemX1(q, r, a, F);
1405	1408	}
1406	1409	else if (F.method == GF2X_MOD_SPECIAL) {
	1410	NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
1407	1411	WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
1408	1412
1409	1413	long sa = a.xrep.length();

1470	1474	r.normalize();
1471	1475	}
1472	1476	else {
	1477	NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
1473	1478	WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
1474	1479
1475	1480	long sa = a.xrep.length();

1566	1571	UseMulDivX1(q, a, F);
1567	1572	}
1568	1573	else if (F.method == GF2X_MOD_SPECIAL) {
	1574	NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
1569	1575	WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
1570	1576
1571	1577	long sa = a.xrep.length();

1618	1624	}
1619	1625	}
1620	1626	else {
	1627	NTL_TLS_GLOBAL_ACCESS(GF2X_rembuf);
1621	1628	WordVectorWatcher watch_GF2X_rembuf(GF2X_rembuf);
1622	1629
1623	1630	long sa = a.xrep.length();

1946	1953	}
1947	1954
1948	1955
1949		const long GF2X_DIV_CROSS = 40*XOVER_SCALE;
	1956	const long GF2X_DIV_CROSS = (NTL_BITS_PER_LONG/2)*XOVER_SCALE;
1950	1957
1951	1958	void DivRem(GF2X& q, GF2X& r, const GF2X& a, const GF2X& b)
1952	1959	{
1953		long sa = a.xrep.length();
1954		long sb = b.xrep.length();
1955
1956		if (sb < GF2X_DIV_CROSS \|\| sa-sb < GF2X_DIV_CROSS)
	1960	long da = deg(a);
	1961	long db = deg(b);
	1962
	1963	if (db < GF2X_DIV_CROSS \|\| da-db < GF2X_DIV_CROSS)
1957	1964	PlainDivRem(q, r, a, b);
1958		else if (sa < 4*sb)
	1965	else if (da < 4*db)
1959	1966	UseMulDivRem(q, r, a, b);
1960	1967	else {
1961	1968	GF2XModulus B;

1966	1973
1967	1974	void div(GF2X& q, const GF2X& a, const GF2X& b)
1968	1975	{
1969		long sa = a.xrep.length();
1970		long sb = b.xrep.length();
1971
1972		if (sb < GF2X_DIV_CROSS \|\| sa-sb < GF2X_DIV_CROSS)
	1976	long da = deg(a);
	1977	long db = deg(b);
	1978
	1979	if (db < GF2X_DIV_CROSS \|\| da-db < GF2X_DIV_CROSS)
1973	1980	PlainDiv(q, a, b);
1974		else if (sa < 4*sb)
	1981	else if (da < 4*db)
1975	1982	UseMulDiv(q, a, b);
1976	1983	else {
1977	1984	GF2XModulus B;

1982	1989
1983	1990	void rem(GF2X& r, const GF2X& a, const GF2X& b)
1984	1991	{
1985		long sa = a.xrep.length();
1986		long sb = b.xrep.length();
1987
1988		if (sb < GF2X_DIV_CROSS \|\| sa-sb < GF2X_DIV_CROSS)
	1992	long da = deg(a);
	1993	long db = deg(b);
	1994
	1995	if (db < GF2X_DIV_CROSS \|\| da-db < GF2X_DIV_CROSS)
1989	1996	PlainRem(r, a, b);
1990		else if (sa < 4*sb)
	1997	else if (da < 4*db)
1991	1998	UseMulRem(r, a, b);
1992	1999	else {
1993	2000	GF2XModulus B;

-1

src/GF2XTimeTest.c less more

52	52	printf("NTL_GF2X_NOINLINE ");
53	53	#endif
54	54
	55	#ifdef NTL_PCLMUL
	56	printf("NTL_PCLMUL ");
	57	#endif
	58
55	59
56	60	printf("\n");
57	61

61	65	{
62	66	long n, i, j, iter, s, k;
63	67	double t;
	68
	69	SetSeed(ZZ(0));
64	70
65	71
66	72	for (i = 0; i < 10000; i++) {

108	114
109	115	iter = iter/2;
110	116
111		iter = long((2/t)*iter) + 1;
	117	iter = long((3/t)*iter) + 1;
112	118
113	119	double tvec[5];
114	120	long w;

+15

-8

src/G_LLL_FP.c less more

562	562	CheckFinite(&p[i]);
563	563	}
564	564
565		NTL_THREAD_LOCAL static double red_fudge = 0;
566		NTL_THREAD_LOCAL static long log_red = 0;
567		NTL_THREAD_LOCAL static long verbose = 0;
568		NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
569		NTL_THREAD_LOCAL static double StartTime = 0;
570		NTL_THREAD_LOCAL static double LastTime = 0;
	565	static NTL_CHEAP_THREAD_LOCAL double red_fudge = 0;
	566	static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
	567	static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
	568	static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
	569	static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
	570	static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
571	571
572	572
573	573

973	973
974	974
975	975
976		NTL_THREAD_LOCAL static vec_double G_BKZConstant;
	976	NTL_TLS_GLOBAL_DECL(vec_double, G_BKZConstant)
977	977
978	978	static
979	979	void ComputeG_BKZConstant(long beta, long p)
980	980	{
	981	NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
	982
981	983	const double c_PI = 3.14159265358979323846264338328;
982	984	const double LogPI = 1.14472988584940017414342735135;
983	985

1028	1030	}
1029	1031	}
1030	1032
1031		NTL_THREAD_LOCAL static vec_double G_BKZThresh;
	1033	NTL_TLS_GLOBAL_DECL(vec_double, G_BKZThresh)
1032	1034
1033	1035	static
1034	1036	void ComputeG_BKZThresh(double *c, long beta)
1035	1037	{
	1038	NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
	1039	NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
	1040
1036	1041	G_BKZThresh.SetLength(beta-1);
1037	1042
1038	1043	long i;

1107	1112	long G_BKZ_FP(mat_ZZ& BB, mat_ZZ* UU, double delta,
1108	1113	long beta, long prune, LLLCheckFct check)
1109	1114	{
	1115	NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
	1116
1110	1117
1111	1118
1112	1119

+29

-9

src/G_LLL_QP.c less more

575	575	CheckFinite(&p[i]);
576	576	}
577	577
578		NTL_THREAD_LOCAL static quad_float red_fudge = to_quad_float(0);
579		NTL_THREAD_LOCAL static long log_red = 0;
580		NTL_THREAD_LOCAL static long verbose = 0;
581		NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
582		NTL_THREAD_LOCAL static double StartTime = 0;
583		NTL_THREAD_LOCAL static double LastTime = 0;
	578	NTL_TLS_GLOBAL_DECL_INIT(quad_float, red_fudge, (to_quad_float(0)))
	579
	580	static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
	581	static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
	582	static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
	583	static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
	584	static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
584	585
585	586
586	587

629	630
630	631	static void init_red_fudge()
631	632	{
	633	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	634
632	635	long i;
633	636
634	637	// initial log_red should be <= NTL_DOUBLE_PRECISION-2,

644	647
645	648	static void inc_red_fudge()
646	649	{
	650	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	651
647	652
648	653	red_fudge = red_fudge * 2;
649	654	log_red--;

661	666	quad_float **aux,
662	667	long m, long init_k, long &quit, GivensCache_QP& cache)
663	668	{
	669	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	670
664	671	long n = B.NumCols();
665	672
666	673	long i, j, k, Fc1;

965	972
966	973
967	974
968		NTL_THREAD_LOCAL static vec_quad_float G_BKZConstant;
	975	NTL_TLS_GLOBAL_DECL(vec_quad_float, G_BKZConstant)
969	976
970	977	static
971	978	void ComputeG_BKZConstant(long beta, long p)
972	979	{
	980	NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
	981
973	982	const quad_float c_PI =
974	983	to_quad_float("3.141592653589793238462643383279502884197");
975	984	const quad_float LogPI =

1022	1031	}
1023	1032	}
1024	1033
1025		NTL_THREAD_LOCAL static vec_quad_float G_BKZThresh;
	1034	NTL_TLS_GLOBAL_DECL(vec_quad_float, G_BKZThresh)
1026	1035
1027	1036	static
1028	1037	void ComputeG_BKZThresh(quad_float *c, long beta)
1029		{
	1038	{
	1039	NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
	1040	NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
	1041
1030	1042	G_BKZThresh.SetLength(beta-1);
1031	1043
1032	1044	long i;

1101	1113	long G_BKZ_QP(mat_ZZ& BB, mat_ZZ* UU, quad_float delta,
1102	1114	long beta, long prune, LLLCheckFct check)
1103	1115	{
	1116	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	1117	NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
	1118
	1119
1104	1120
1105	1121	long m = BB.NumRows();
1106	1122	long n = BB.NumCols();

1569	1585	long G_BKZ_QP1(mat_ZZ& BB, mat_ZZ* UU, quad_float delta,
1570	1586	long beta, long prune, LLLCheckFct check)
1571	1587	{
	1588	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	1589	NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
	1590
	1591
1572	1592	long m = BB.NumRows();
1573	1593	long n = BB.NumCols();
1574	1594	long m_orig = m;

+24

-8

src/G_LLL_RR.c less more

380	380
381	381	}
382	382
383		NTL_THREAD_LOCAL static RR red_fudge;
384		NTL_THREAD_LOCAL static long log_red = 0;
	383	NTL_TLS_GLOBAL_DECL(RR, red_fudge)
	384
	385	static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
385	386
386	387	static void init_red_fudge()
387	388	{
	389	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	390
388	391	log_red = long(0.50*RR::precision());
389	392
390	393	power2(red_fudge, -log_red);

392	395
393	396	static void inc_red_fudge()
394	397	{
	398	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	399
395	400
396	401	mul(red_fudge, red_fudge, 2);
397	402	log_red--;

405	410
406	411
407	412
408		NTL_THREAD_LOCAL static long verbose = 0;
409		NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
410		NTL_THREAD_LOCAL static double StartTime = 0;
411		NTL_THREAD_LOCAL static double LastTime = 0;
	413	static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
	414	static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
	415	static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
	416	static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
412	417
413	418
414	419

462	467	mat_RR& aux, long m, long init_k, long &quit,
463	468	GivensCache_RR& cache)
464	469	{
	470	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	471
465	472	long n = B.NumCols();
466	473
467	474	long i, j, k, Fc1;

745	752
746	753
747	754
748		NTL_THREAD_LOCAL static vec_RR G_BKZConstant;
	755	NTL_TLS_GLOBAL_DECL(vec_RR, G_BKZConstant)
749	756
750	757	static
751	758	void ComputeG_BKZConstant(long beta, long p)
752	759	{
	760	NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
	761
753	762	RR c_PI;
754	763	ComputePi(c_PI);
755	764

799	808
800	809	}
801	810
802		NTL_THREAD_LOCAL static vec_RR G_BKZThresh;
	811	NTL_TLS_GLOBAL_DECL(vec_RR, G_BKZThresh)
803	812
804	813	static
805	814	void ComputeG_BKZThresh(RR *c, long beta)
806	815	{
	816	NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
	817	NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
	818
807	819	G_BKZThresh.SetLength(beta-1);
808	820
809	821	long i;

885	897	long G_BKZ_RR(mat_ZZ& BB, mat_ZZ* UU, const RR& delta,
886	898	long beta, long prune, LLLCheckFct check)
887	899	{
	900	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	901	NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
	902
	903
888	904	long m = BB.NumRows();
889	905	long n = BB.NumCols();
890	906	long m_orig = m;

+24

-8

src/G_LLL_XD.c less more

348	348	if (k > n) p[k] = 0;
349	349	}
350	350
351		NTL_THREAD_LOCAL static xdouble red_fudge = to_xdouble(0);
352		NTL_THREAD_LOCAL static long log_red = 0;
	351	NTL_TLS_GLOBAL_DECL_INIT(xdouble, red_fudge, (to_xdouble(0)))
	352
	353	static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
353	354
354	355	static void init_red_fudge()
355	356	{
	357	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	358
356	359	long i;
357	360
358	361	log_red = long(0.50*NTL_DOUBLE_PRECISION);

364	367
365	368	static void inc_red_fudge()
366	369	{
	370	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	371
367	372
368	373	red_fudge = red_fudge * 2;
369	374	log_red--;

376	381
377	382
378	383
379		NTL_THREAD_LOCAL static long verbose = 0;
380		NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
381		NTL_THREAD_LOCAL static double StartTime = 0;
382		NTL_THREAD_LOCAL static double LastTime = 0;
	384	static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
	385	static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
	386	static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
	387	static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
383	388
384	389
385	390

432	437	xdouble **aux,
433	438	long m, long init_k, long &quit, GivensCache_XD& cache)
434	439	{
	440	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	441
435	442	long n = B.NumCols();
436	443
437	444	long i, j, k, Fc1;

713	720
714	721
715	722
716		NTL_THREAD_LOCAL static vec_xdouble G_BKZConstant;
	723	NTL_TLS_GLOBAL_DECL(vec_xdouble, G_BKZConstant)
717	724
718	725	static
719	726	void ComputeG_BKZConstant(long beta, long p)
720	727	{
	728	NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
	729
721	730	const double c_PI = 3.14159265358979323846264338328;
722	731	const double LogPI = 1.14472988584940017414342735135;
723	732

768	777	}
769	778	}
770	779
771		NTL_THREAD_LOCAL static vec_xdouble G_BKZThresh;
	780	NTL_TLS_GLOBAL_DECL(vec_xdouble, G_BKZThresh)
772	781
773	782	static
774	783	void ComputeG_BKZThresh(xdouble *c, long beta)
775	784	{
	785	NTL_TLS_GLOBAL_ACCESS(G_BKZConstant);
	786	NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
	787
776	788	G_BKZThresh.SetLength(beta-1);
777	789
778	790	long i;

846	858	long G_BKZ_XD(mat_ZZ& BB, mat_ZZ* UU, xdouble delta,
847	859	long beta, long prune, LLLCheckFct check)
848	860	{
	861	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	862	NTL_TLS_GLOBAL_ACCESS(G_BKZThresh);
	863
	864
849	865	long m = BB.NumRows();
850	866	long n = BB.NumCols();
851	867	long m_orig = m;

+126

-0

src/GetTime0.c less more

	0
	1	/*
	2	* Author: David Robert Nadeau
	3	* Site: http://NadeauSoftware.com/
	4	* License: Creative Commons Attribution 3.0 Unported License
	5	* http://creativecommons.org/licenses/by/3.0/deed.en_US
	6	*/
	7
	8
	9	// NTL NOTES: I've adapted this code from the above source.
	10	// The reason is that for some multithreaded benchmarking, I want
	11	// to use wall clock time, and this seemed like the best multiplatform
	12	// solution to getting a high-resolution wall clock timer.
	13	// The only change I made to the original code is to initialize
	14	// timeConvert for the OSX case using a thread-safe initialization
	15	// C++ idiom.
	16
	17
	18
	19	#if defined(_WIN32)
	20	#include <Windows.h>
	21
	22	#elif defined(__unix__) \|\| defined(__unix) \|\| defined(unix) \|\| (defined(__APPLE__) && defined(__MACH__))
	23	#include <unistd.h> /* POSIX flags */
	24	#include <time.h> /* clock_gettime(), time() */
	25	#include <sys/time.h> /* gethrtime(), gettimeofday() */
	26
	27	#if defined(__MACH__) && defined(__APPLE__)
	28	#include <mach/mach.h>
	29	#include <mach/mach_time.h>
	30
	31	static inline double InitTimeConvert()
	32	{
	33	mach_timebase_info_data_t timeBase;
	34	(void)mach_timebase_info( &timeBase );
	35	return (double)timeBase.numer / (double)timeBase.denom / 1000000000.0;
	36	}
	37
	38	#endif
	39
	40	#else
	41	#error "Unable to define GetTime( ) for an unknown OS."
	42	#endif
	43
	44
	45
	46
	47
	48
	49	/**
	50	* Returns the real time, in seconds, or -1.0 if an error occurred.
	51	*
	52	* Time is measured since an arbitrary and OS-dependent start time.
	53	* The returned real time is only useful for computing an elapsed time
	54	* between two calls to this function.
	55	*/
	56	double _ntl_GetTime( )
	57	{
	58	#if defined(_WIN32)
	59	FILETIME tm;
	60	ULONGLONG t;
	61	#if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8
	62	/* Windows 8, Windows Server 2012 and later. ---------------- */
	63	GetSystemTimePreciseAsFileTime( &tm );
	64	#else
	65	/* Windows 2000 and later. ---------------------------------- */
	66	GetSystemTimeAsFileTime( &tm );
	67	#endif
	68	t = ((ULONGLONG)tm.dwHighDateTime << 32) \| (ULONGLONG)tm.dwLowDateTime;
	69	return (double)t / 10000000.0;
	70
	71	#elif (defined(__hpux) \|\| defined(hpux)) \|\| ((defined(__sun__) \|\| defined(__sun) \|\| defined(sun)) && (defined(__SVR4) \|\| defined(__svr4__)))
	72	/* HP-UX, Solaris. ------------------------------------------ */
	73	return (double)gethrtime( ) / 1000000000.0;
	74
	75	#elif defined(__MACH__) && defined(__APPLE__)
	76	/* OSX. ----------------------------------------------------- */
	77	static double timeConvert = InitTimeConvert();
	78	// even in a multi-threaded environment, this will
	79	// be safely initialized, according to C++11 standard
	80
	81	return (double)mach_absolute_time( ) * timeConvert;
	82
	83	#elif defined(_POSIX_VERSION)
	84	/* POSIX. --------------------------------------------------- */
	85	#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0)
	86	{
	87	struct timespec ts;
	88	#if defined(CLOCK_MONOTONIC_PRECISE)
	89	/* BSD. --------------------------------------------- */
	90	const clockid_t id = CLOCK_MONOTONIC_PRECISE;
	91	#elif defined(CLOCK_MONOTONIC_RAW)
	92	/* Linux. ------------------------------------------- */
	93	const clockid_t id = CLOCK_MONOTONIC_RAW;
	94	#elif defined(CLOCK_HIGHRES)
	95	/* Solaris. ----------------------------------------- */
	96	const clockid_t id = CLOCK_HIGHRES;
	97	#elif defined(CLOCK_MONOTONIC)
	98	/* AIX, BSD, Linux, POSIX, Solaris. ----------------- */
	99	const clockid_t id = CLOCK_MONOTONIC;
	100	#elif defined(CLOCK_REALTIME)
	101	/* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */
	102	const clockid_t id = CLOCK_REALTIME;
	103	#else
	104	const clockid_t id = (clockid_t)-1; /* Unknown. */
	105	#endif /* CLOCK_* */
	106	if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 )
	107	return (double)ts.tv_sec +
	108	(double)ts.tv_nsec / 1000000000.0;
	109	/* Fall thru. */
	110	}
	111	#endif /* _POSIX_TIMERS */
	112
	113	/* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */
	114	struct timeval tm;
	115	gettimeofday( &tm, NULL );
	116	return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0;
	117	#else
	118	return -1.0; /* Failed. */
	119	#endif
	120	}
	121
	122
	123
	124
	125

-2

src/GetTime4.c less more

15	15
16	16	double _ntl_GetTime()
17	17	{
18		NTL_THREAD_LOCAL static clock_t last_clock = 0;
19		NTL_THREAD_LOCAL static double acc = 0;
	18	static NTL_CHEAP_THREAD_LOCAL clock_t last_clock = 0;
	19	static NTL_CHEAP_THREAD_LOCAL double acc = 0;
20	20
21	21	clock_t this_clock;
22	22	double delta;

+31

-6

src/InitSettings.c less more

27	27	cout << "NTL_THREADS=0\n";
28	28	#endif
29	29
	30	#ifdef NTL_DISABLE_TLS_HACK
	31	cout << "NTL_DISABLE_TLS_HACK=1\n";
	32	#else
	33	cout << "NTL_DISABLE_TLS_HACK=0\n";
	34	#endif
	35
	36	#ifdef NTL_ENABLE_TLS_HACK
	37	cout << "NTL_ENABLE_TLS_HACK=1\n";
	38	#else
	39	cout << "NTL_ENABLE_TLS_HACK=0\n";
	40	#endif
	41
30	42	#ifdef NTL_EXCEPTIONS
31	43	cout << "NTL_EXCEPTIONS=1\n";
32	44	#else
33	45	cout << "NTL_EXCEPTIONS=0\n";
	46	#endif
	47
	48	#ifdef NTL_THREAD_BOOST
	49	cout << "NTL_THREAD_BOOST=1\n";
	50	#else
	51	cout << "NTL_THREAD_BOOST=0\n";
34	52	#endif
35	53
36	54

55	73	#endif
56	74
57	75
	76	#ifdef NTL_DISABLE_LL_ASM
	77	cout << "NTL_DISABLE_LL_ASM=1\n";
	78	#else
	79	cout << "NTL_DISABLE_LL_ASM=0\n";
	80	#endif
	81
	82	#ifdef NTL_MAXIMIZE_SP_NBITS
	83	cout << "NTL_MAXIMIZE_SP_NBITS=1\n";
	84	#else
	85	cout << "NTL_MAXIMIZE_SP_NBITS=0\n";
	86	#endif
	87
	88
58	89
59	90	#ifdef NTL_GMP_LIP
60	91	cout << "NTL_GMP_LIP=1\n";

67	98	cout << "NTL_GF2X_LIB=1\n";
68	99	#else
69	100	cout << "NTL_GF2X_LIB=0\n";
70		#endif
71
72		#ifdef NTL_PCLMUL
73		cout << "NTL_PCLMUL=1\n";
74		#else
75		cout << "NTL_PCLMUL=0\n";
76	101	#endif
77	102
78	103	#ifdef NTL_LONG_LONG_TYPE

+11

-11

src/LLL_FP.c less more

425	425	c[k] = b[k] - s;
426	426	}
427	427
428		NTL_THREAD_LOCAL static double red_fudge = 0;
429		NTL_THREAD_LOCAL static long log_red = 0;
430		NTL_THREAD_LOCAL static long verbose = 0;
431
432		NTL_THREAD_LOCAL double LLLStatusInterval = 900.0;
433		NTL_THREAD_LOCAL char *LLLDumpFile = 0;
434
435		NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
436		NTL_THREAD_LOCAL static double RR_GS_time = 0;
437		NTL_THREAD_LOCAL static double StartTime = 0;
438		NTL_THREAD_LOCAL static double LastTime = 0;
	428	NTL_CHEAP_THREAD_LOCAL double LLLStatusInterval = 900.0;
	429	NTL_CHEAP_THREAD_LOCAL char *LLLDumpFile = 0;
	430
	431	static NTL_CHEAP_THREAD_LOCAL double red_fudge = 0;
	432	static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
	433	static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
	434
	435	static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
	436	static NTL_CHEAP_THREAD_LOCAL double RR_GS_time = 0;
	437	static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
	438	static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
439	439
440	440
441	441

+30

-9

src/LLL_QP.c less more

429	429	c[k] = b[k] - s;
430	430	}
431	431
432		NTL_THREAD_LOCAL static quad_float red_fudge = to_quad_float(0);
433		NTL_THREAD_LOCAL static long log_red = 0;
434		NTL_THREAD_LOCAL static long verbose = 0;
435		NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
436		NTL_THREAD_LOCAL static double StartTime = 0;
437		NTL_THREAD_LOCAL static double LastTime = 0;
	432	NTL_TLS_GLOBAL_DECL_INIT(quad_float, red_fudge, (to_quad_float(0)))
	433
	434	static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
	435	static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
	436	static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
	437	static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
	438	static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
438	439
439	440
440	441	static void LLLStatus(long max_k, double t, long m, const mat_ZZ& B)

482	483
483	484	static void init_red_fudge()
484	485	{
	486	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	487
485	488	long i;
486	489
487	490	// initial log_red should be <= NTL_DOUBLE_PRECISION-2,

497	500
498	501	static void inc_red_fudge()
499	502	{
	503	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	504
500	505
501	506	red_fudge = red_fudge * 2;
502	507	log_red--;

514	519	quad_float b, quad_float c,
515	520	long m, long init_k, long &quit)
516	521	{
	522	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	523
517	524	long n = B.NumCols();
518	525
519	526	long i, j, k, Fc1;

526	533	quad_float *tp;
527	534
528	535
529		NTL_THREAD_LOCAL static double bound = 0;
	536	static NTL_CHEAP_THREAD_LOCAL double bound = 0;
530	537
531	538
532	539	if (bound == 0) {

883	890
884	891
885	892
886		NTL_THREAD_LOCAL static vec_quad_float BKZConstant;
	893	NTL_TLS_GLOBAL_DECL(vec_quad_float, BKZConstant)
887	894
888	895	static
889	896	void ComputeBKZConstant(long beta, long p)
890	897	{
	898	NTL_TLS_GLOBAL_ACCESS(BKZConstant);
	899
	900
891	901	const quad_float c_PI =
892	902	to_quad_float("3.141592653589793238462643383279502884197");
893	903	const quad_float LogPI =

941	951	}
942	952
943	953
944		NTL_THREAD_LOCAL static vec_quad_float BKZThresh;
	954	NTL_TLS_GLOBAL_DECL(vec_quad_float, BKZThresh)
945	955
946	956	static
947	957	void ComputeBKZThresh(quad_float *c, long beta)
948	958	{
	959	NTL_TLS_GLOBAL_ACCESS(BKZConstant);
	960	NTL_TLS_GLOBAL_ACCESS(BKZThresh);
	961
949	962	BKZThresh.SetLength(beta-1);
950	963
951	964	long i;

1020	1033	long BKZ_QP(mat_ZZ& BB, mat_ZZ* UU, quad_float delta,
1021	1034	long beta, long prune, LLLCheckFct check)
1022	1035	{
	1036	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	1037	NTL_TLS_GLOBAL_ACCESS(BKZThresh);
	1038
	1039
1023	1040
1024	1041	long m = BB.NumRows();
1025	1042	long n = BB.NumCols();

1494	1511	long BKZ_QP1(mat_ZZ& BB, mat_ZZ* UU, quad_float delta,
1495	1512	long beta, long prune, LLLCheckFct check)
1496	1513	{
	1514	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	1515	NTL_TLS_GLOBAL_ACCESS(BKZThresh);
	1516
	1517
1497	1518
1498	1519	long m = BB.NumRows();
1499	1520	long n = BB.NumCols();

+26

-8

src/LLL_RR.c less more

161	161	sub(c(k), b(k), s);
162	162	}
163	163
164		NTL_THREAD_LOCAL static RR red_fudge;
165		NTL_THREAD_LOCAL static long log_red = 0;
	164	NTL_TLS_GLOBAL_DECL(RR, red_fudge)
	165
	166	static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
166	167
167	168	static void init_red_fudge()
168	169	{
	170	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	171
169	172	log_red = long(0.50*RR::precision());
170	173
171	174	power2(red_fudge, -log_red);

173	176
174	177	static void inc_red_fudge()
175	178	{
	179	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	180
176	181
177	182	mul(red_fudge, red_fudge, 2);
178	183	log_red--;

186	191
187	192
188	193
189		NTL_THREAD_LOCAL static long verbose = 0;
190		NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
191		NTL_THREAD_LOCAL static double StartTime = 0;
192		NTL_THREAD_LOCAL static double LastTime = 0;
	194	static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
	195	static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
	196	static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
	197	static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
193	198
194	199
195	200

242	247	LLLCheckFct check, mat_RR& B1, mat_RR& mu,
243	248	vec_RR& b, vec_RR& c, long m, long init_k, long &quit)
244	249	{
	250	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	251
245	252	long n = B.NumCols();
246	253
247	254	long i, j, k, Fc1;

587	594
588	595
589	596
590		NTL_THREAD_LOCAL static vec_RR BKZConstant;
	597	NTL_TLS_GLOBAL_DECL(vec_RR, BKZConstant)
591	598
592	599	static
593	600	void ComputeBKZConstant(long beta, long p)
594	601	{
	602	NTL_TLS_GLOBAL_ACCESS(BKZConstant);
	603
595	604	RR c_PI;
596	605	ComputePi(c_PI);
597	606

641	650
642	651	}
643	652
644		NTL_THREAD_LOCAL static vec_RR BKZThresh;
	653	NTL_TLS_GLOBAL_DECL(vec_RR, BKZThresh)
645	654
646	655	static
647	656	void ComputeBKZThresh(RR *c, long beta)
648	657	{
	658	NTL_TLS_GLOBAL_ACCESS(BKZConstant);
	659	NTL_TLS_GLOBAL_ACCESS(BKZThresh);
	660
649	661	BKZThresh.SetLength(beta-1);
650	662
651	663	long i;

727	739	long BKZ_RR(mat_ZZ& BB, mat_ZZ* UU, const RR& delta,
728	740	long beta, long prune, LLLCheckFct check)
729	741	{
	742	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	743	NTL_TLS_GLOBAL_ACCESS(BKZThresh);
	744
	745
730	746	long m = BB.NumRows();
731	747	long n = BB.NumCols();
732	748	long m_orig = m;

1194	1210
1195	1211	void NearVector(vec_ZZ& ww, const mat_ZZ& BB, const vec_ZZ& a)
1196	1212	{
	1213	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	1214
1197	1215	long n = BB.NumCols();
1198	1216
1199	1217	if (n != BB.NumRows())

+26

-9

src/LLL_XD.c less more

189	189	c[k] = b[k] - s;
190	190	}
191	191
192		NTL_THREAD_LOCAL static xdouble red_fudge = to_xdouble(0);
193		NTL_THREAD_LOCAL static long log_red = 0;
	192	NTL_TLS_GLOBAL_DECL_INIT(xdouble, red_fudge, (to_xdouble(0)))
	193
	194
	195	static NTL_CHEAP_THREAD_LOCAL long log_red = 0;
194	196
195	197	static void init_red_fudge()
196	198	{
	199	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	200
197	201	long i;
198	202
199	203	log_red = long(0.50*NTL_DOUBLE_PRECISION);

205	209
206	210	static void inc_red_fudge()
207	211	{
	212	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	213
208	214
209	215	red_fudge = red_fudge * 2;
210	216	log_red--;

217	223
218	224
219	225
220		NTL_THREAD_LOCAL static long verbose = 0;
221		NTL_THREAD_LOCAL static unsigned long NumSwaps = 0;
222		NTL_THREAD_LOCAL static double StartTime = 0;
223		NTL_THREAD_LOCAL static double LastTime = 0;
	226	static NTL_CHEAP_THREAD_LOCAL long verbose = 0;
	227	static NTL_CHEAP_THREAD_LOCAL unsigned long NumSwaps = 0;
	228	static NTL_CHEAP_THREAD_LOCAL double StartTime = 0;
	229	static NTL_CHEAP_THREAD_LOCAL double LastTime = 0;
224	230
225	231
226	232

273	279	xdouble b, xdouble c,
274	280	long m, long init_k, long &quit)
275	281	{
	282	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	283
276	284	long n = B.NumCols();
277	285
278	286	long i, j, k, Fc1;

284	292	xdouble *tp;
285	293
286	294
287		NTL_THREAD_LOCAL static xdouble bound = to_xdouble(0);
	295	NTL_TLS_LOCAL_INIT(xdouble, bound, (to_xdouble(0)));
288	296
289	297
290	298	if (bound == 0) {

616	624
617	625
618	626
619		NTL_THREAD_LOCAL static vec_xdouble BKZConstant;
	627	NTL_TLS_GLOBAL_DECL(vec_xdouble, BKZConstant)
620	628
621	629	static
622	630	void ComputeBKZConstant(long beta, long p)
623	631	{
	632	NTL_TLS_GLOBAL_ACCESS(BKZConstant);
	633
624	634	const double c_PI = 3.14159265358979323846264338328;
625	635	const double LogPI = 1.14472988584940017414342735135;
626	636

671	681	}
672	682	}
673	683
674		NTL_THREAD_LOCAL static vec_xdouble BKZThresh;
	684	NTL_TLS_GLOBAL_DECL(vec_xdouble, BKZThresh)
675	685
676	686	static
677	687	void ComputeBKZThresh(xdouble *c, long beta)
678	688	{
	689	NTL_TLS_GLOBAL_ACCESS(BKZConstant);
	690	NTL_TLS_GLOBAL_ACCESS(BKZThresh);
	691
679	692	BKZThresh.SetLength(beta-1);
680	693
681	694	long i;

749	762	long BKZ_XD(mat_ZZ& BB, mat_ZZ* UU, xdouble delta,
750	763	long beta, long prune, LLLCheckFct check)
751	764	{
	765	NTL_TLS_GLOBAL_ACCESS(red_fudge);
	766	NTL_TLS_GLOBAL_ACCESS(BKZThresh);
	767
	768
752	769	long m = BB.NumRows();
753	770	long n = BB.NumCols();
754	771	long m_orig = m;

-24

~~src/MakeCheckCLZL~~ less more

0
1		echo "Checking for __builtin_clzl"
2
3		cp ../include/NTL/have_builtin_clzl_no.h ../include/NTL/have_builtin_clzl.h
4
5		sh RemoveProg CheckCLZL
6		echo $1 -o CheckCLZL CheckCLZL.c CheckCLZLAux.c $2
7		$1 -o CheckCLZL CheckCLZL.c CheckCLZLAux.c $2
8
9		if test -f CheckCLZL
10		then
11		if ./CheckCLZL
12		then
13		echo "have __builtin_clzl"
14		cp ../include/NTL/have_builtin_clzl_yes.h ../include/NTL/have_builtin_clzl.h
15		sh RemoveProg CheckCLZL
16		exit 0
17		fi
18		fi
19
20		echo "DO NOT have __builtin__clzl"
21		sh RemoveProg CheckCLZL
22		exit 0
23

+26

-0

src/MakeCheckFeature less more

	0
	1	printf '*** Checking for feature: %s ' "$1"
	2
	3	echo "" > "../include/NTL/HAVE_$1.h"
	4
	5	sh RemoveProg CheckFeature
	6	echo $3 -o CheckFeature $2 $4 >> "CheckFeature.log" 2>&1
	7	$3 -o CheckFeature $2 $4 >> "CheckFeature.log" 2>&1
	8
	9	if test -f CheckFeature
	10	then
	11	if ./CheckFeature
	12	then
	13	echo "[yes]"
	14	echo "#ifndef NTL_HAVE_$1" > "../include/NTL/HAVE_$1.h"
	15	echo "#define NTL_HAVE_$1" >> "../include/NTL/HAVE_$1.h"
	16	echo "#endif" >> "../include/NTL/HAVE_$1.h"
	17	sh RemoveProg CheckFeature
	18	exit 0
	19	fi
	20	fi
	21
	22	echo "[no]"
	23	sh RemoveProg CheckFeature
	24	exit 0
	25

-24

~~src/MakeCheckLL~~ less more

0
1		echo "Checking for working LL type"
2
3		cp ../include/NTL/have_LL_no.h ../include/NTL/have_LL.h
4
5		sh RemoveProg CheckLL
6		echo $1 -o CheckLL CheckLL.c CheckLLAux.c $2
7		$1 -o CheckLL CheckLL.c CheckLLAux.c $2
8
9		if test -f CheckLL
10		then
11		if ./CheckLL
12		then
13		echo "have LL type"
14		cp ../include/NTL/have_LL_yes.h ../include/NTL/have_LL.h
15		sh RemoveProg CheckLL
16		exit 0
17		fi
18		fi
19
20		echo "DO NOT have LL type"
21		sh RemoveProg CheckLL
22		exit 0
23

+78

-70

src/MakeDesc.c less more

46	46	void touch_double(double* x);
47	47	void touch_ldouble(long double* x);
48	48
49
50
51
52		double power2(long k)
53		{
54		long i;
55		double res;
56
57		res = 1;
58
59		for (i = 1; i <= k; i++)
60		res = res * 2;
61
62		return res;
63		}
64
	49	double sum_double(double *x, long n);
	50
	51	double fma_test(double a, double b, double c);
	52
	53
	54	double power2(long k);
	55
	56
	57	long FMADetected(long dp)
	58	{
	59	double x = power2(0) + power2(dp-1);
	60	double y = power2(0) + power2(dp-1);
	61
	62	touch_double(&x);
	63	touch_double(&y);
	64
	65	double z = x*y;
	66	touch_double(&z);
	67	z = -z;
	68	touch_double(&z);
	69
	70	double lo = fma_test(x, y, z);
	71	return lo != 0;
	72	}
65	73
66	74	long DoubleRounding(long dp)
67	75	{
68	76	double a = power2(dp-1) + 1;
69	77	double b = (power2(dp)-1)/power2(dp+1);
70		register double x = a + b;
71		double y = x;
72
73		touch_double(&y);
74
75		if (y != power2(dp-1) + 1)
	78
	79	double vec[2];
	80	vec[0] = a;
	81	vec[1] = b;
	82
	83	double sum = sum_double(vec, 2);
	84
	85	touch_double(&sum);
	86
	87	if (sum != a)
76	88	return 1;
77	89	else
78	90	return 0;

97	109	eps *= 1.0/2.0;
98	110	tmp = 1.0 + eps;
99	111	touch_double(&tmp);
100		res = tmp - one;
101		} while (res == eps);
102
103		return k;
104		}
105
106		long DoublePrecision1()
107		{
108		double eps, one, res;
109		long k;
110
111		one = val_double(1.0);
112		eps = val_double(1.0);
113
114		k = 0;
115
116		do {
117		register double tmp;
118
119		k++;
120		eps *= 1.0/2.0;
121		tmp = 1.0 + eps;
122	112	res = tmp - one;
123	113	} while (res == eps);
124	114

763	753	int main()
764	754	{
765	755	long bpl, bpi, bpt, rs_arith, nbits, wnbits;
766		long dp, dp1, dr;
	756	long dp, dr;
	757	long fma_detected;
767	758	long ldp;
768	759	FILE *f;
769	760	long warnings = 0;

839	830	}
840	831
841	832	/*
842		* check that ints are bigger than chars.
843		*/
844
845		if (bpi <= CHAR_BIT) {
846		fprintf(stderr, "BAD NEWS: int type must be longer than char type.\n");
847		return 1;
848		}
849
	833	* check that there are 8 bits in a char. This is a POSIX requirement.
	834	*/
	835
	836	if (CHAR_BIT != 8) {
	837	fprintf(stderr, "BAD NEWS: char type must have 8 bits.\n");
	838	return 1;
	839	}
	840
	841
	842	/*
	843	* check that bpi is a multiple of 8.
	844	*/
	845
	846	if (bpi % 8 != 0) {
	847	fprintf(stderr, "BAD NEWS: int type must be multiple of 8 bits.\n");
	848	return 1;
	849	}
850	850
851	851
852	852	/*

854	854	*/
855	855
856	856	if (bpl % 8 != 0) {
857		fprintf(stderr, "BAD NEWS: word size must be multiple of 8 bits.\n");
858		return 1;
859		}
860
861
	857	fprintf(stderr, "BAD NEWS: long type must be multiple of 8 bits.\n");
	858	return 1;
	859	}
862	860
863	861
864	862	/*

995	993	* This test almost always yields the correct result --- if not,
996	994	* you will have to set the NTL_EXT_DOUBLE in "mach_desc.h"
997	995	* by hand.
998		*
999		* The test effectively proves that in-register doubles are wide
1000		* if dp1 > dp || dr.
1001		*/
1002
1003
1004		dp1 = DoublePrecision1();
	996	*/
	997
	998
1005	999	dr = DoubleRounding(dp);
	1000
	1001
	1002	/*
	1003	* Next, we check if the platform uses FMA (fused multiply add),
	1004	* even across statement boundaries.
	1005	*/
	1006
	1007	fma_detected = FMADetected(dp);
	1008
1006	1009
1007	1010
1008	1011	/*

1069	1072	fprintf(stderr, "long double precision = %ld\n", ldp);
1070	1073	fprintf(stderr, "NBITS (maximum) = %ld\n", nbits);
1071	1074	fprintf(stderr, "WNBITS (maximum) = %ld\n", wnbits);
1072		fprintf(stderr, "register double precision = %ld\n", dp1);
1073	1075	fprintf(stderr, "double rounding detected = %s\n", yn_vec[dr]);
1074
1075		if (((dp1 > dp) || dr) && GNUC_INTEL)
	1076	fprintf(stderr, "FMA detected = %s\n", yn_vec[fma_detected]);
	1077
	1078	if (dr && GNUC_INTEL)
1076	1079	fprintf(stderr, "-- auto x86 fix\n");
1077	1080
1078	1081	if (dp != 53) {

1096	1099
1097	1100	#endif
1098	1101
1099		if (((dp1 > dp) || dr) && !GNUC_INTEL) {
	1102	if (dr && !GNUC_INTEL) {
1100	1103	warnings = 1;
1101	1104	fprintf(stderr, "\n\nWARNING:\n\n");
1102	1105	fprintf(stderr, "This platform has extended double precision registers.\n");

1171	1174	fprintf(f, "#define NTL_QUAD_FLOAT_SPLIT (");
1172	1175	print2k(f, dp - (dp/2), bpl);
1173	1176	fprintf(f, "+1.0)\n");
1174		fprintf(f, "#define NTL_EXT_DOUBLE (%d)\n", ((dp1 > dp) || dr));
	1177	fprintf(f, "#define NTL_EXT_DOUBLE (%ld)\n", dr);
	1178
	1179	fprintf(f, "#define NTL_FMA_DETECTED (%ld)\n", fma_detected);
	1180
	1181
	1182
1175	1183	print_BB_mul_code(f, bpl);
1176	1184	print_BB_sqr_code(f, bpl);
1177	1185	print_BB_rev_code(f, bpl);

+30

-0

src/MakeDescAux.c less more

23	23	void touch_double(double* x) {}
24	24	void touch_ldouble(long double* x) {}
25	25
	26	double sum_double(double *x, long n)
	27	{
	28	long i;
	29	double acc = 0;
	30
	31	for (i = 0; i < n; i++)
	32	acc += x[i];
	33
	34	return acc;
	35	}
	36
	37	double fma_test(double a, double b, double c)
	38	{
	39	double t1 = a*b;
	40	double t2 = t1 + c;
	41	return t2;
	42	}
	43
	44	double power2(long k)
	45	{
	46	long i;
	47	double res;
	48
	49	res = 1;
	50
	51	for (i = 1; i <= k; i++)
	52	res = res * 2;
	53
	54	return res;
	55	}

-2

src/MakeGetPID less more

7	7
8	8
9	9	sh RemoveProg TestGetPID
10		echo $1 -o TestGetPID TestGetPID.c GetPID1.c $2
11		$1 -o TestGetPID TestGetPID.c GetPID1.c $2
	10	echo $1 -o TestGetPID TestGetPID.c GetPID1.c $2 >> "CheckFeature.log" 2>&1
	11	$1 -o TestGetPID TestGetPID.c GetPID1.c $2 >> "CheckFeature.log" 2>&1
12	12
13	13	if test -f TestGetPID
14	14	then

+14

-14

src/MakeGetTime less more

7	7
8	8
9	9	sh RemoveProg TestGetTime
10		echo $1 -o TestGetTime TestGetTime.c GetTime1.c $2
11		$1 -o TestGetTime TestGetTime.c GetTime1.c $2
	10	echo $1 -o TestGetTime TestGetTime.c GetTime1.c $2 >> "CheckFeature.log" 2>&1
	11	$1 -o TestGetTime TestGetTime.c GetTime1.c $2 >> "CheckFeature.log" 2>&1
12	12
13	13	if test -f TestGetTime
14	14	then
15		if ./TestGetTime 1 1048576 1048575
	15	if ./TestGetTime 1 1048576 1048575 >> "CheckFeature.log" 2>&1
16	16	then
17	17	cp GetTime1.c GetTime.c
18	18	echo "using GetTime1.c"

22	22
23	23
24	24	sh RemoveProg TestGetTime
25		echo $1 -o TestGetTime TestGetTime.c GetTime2.c $2
26		$1 -o TestGetTime TestGetTime.c GetTime2.c $2
	25	echo $1 -o TestGetTime TestGetTime.c GetTime2.c $2 >> "CheckFeature.log" 2>&1
	26	$1 -o TestGetTime TestGetTime.c GetTime2.c $2 >> "CheckFeature.log" 2>&1
27	27
28	28	if test -f TestGetTime
29	29	then
30		if ./TestGetTime 1 1048576 1048575
	30	if ./TestGetTime 1 1048576 1048575 >> "CheckFeature.log" 2>&1
31	31	then
32	32	cp GetTime2.c GetTime.c
33	33	echo "using GetTime2.c"

36	36	fi
37	37
38	38	sh RemoveProg TestGetTime
39		echo $1 -o TestGetTime TestGetTime.c GetTime3.c $2
40		$1 -o TestGetTime TestGetTime.c GetTime3.c $2
	39	echo $1 -o TestGetTime TestGetTime.c GetTime3.c $2 >> "CheckFeature.log" 2>&1
	40	$1 -o TestGetTime TestGetTime.c GetTime3.c $2 >> "CheckFeature.log" 2>&1
41	41
42	42	if test -f TestGetTime
43	43	then
44		if ./TestGetTime 1 1048576 1048575
	44	if ./TestGetTime 1 1048576 1048575 >> "CheckFeature.log" 2>&1
45	45	then
46	46	cp GetTime3.c GetTime.c
47	47	echo "using GetTime3.c"

50	50	fi
51	51
52	52	sh RemoveProg TestGetTime
53		echo $1 -o TestGetTime TestGetTime.c GetTime4.c $2
54		$1 -o TestGetTime TestGetTime.c GetTime4.c $2
	53	echo $1 -o TestGetTime TestGetTime.c GetTime4.c $2 >> "CheckFeature.log" 2>&1
	54	$1 -o TestGetTime TestGetTime.c GetTime4.c $2 >> "CheckFeature.log" 2>&1
55	55
56	56
57	57	if test -f TestGetTime
58	58	then
59		if ./TestGetTime 1 1048576 1048575
	59	if ./TestGetTime 1 1048576 1048575 >> "CheckFeature.log" 2>&1
60	60	then
61	61	cp GetTime4.c GetTime.c
62	62	echo "using GetTime4.c"

65	65	fi
66	66
67	67	sh RemoveProg TestGetTime
68		echo $1 -o TestGetTime TestGetTime.c GetTime5.c $2
69		$1 -o TestGetTime TestGetTime.c GetTime5.c $2
	68	echo $1 -o TestGetTime TestGetTime.c GetTime5.c $2 >> "CheckFeature.log" 2>&1
	69	$1 -o TestGetTime TestGetTime.c GetTime5.c $2 >> "CheckFeature.log" 2>&1
70	70
71	71
72	72	if test -f TestGetTime

-1

src/MulTimeTest.c less more

61	61	}
62	62
63	63	#endif
	64
	65	SetSeed(ZZ(0));
64	66
65	67	long i, k;
66	68

127	129
128	130
129	131	iter = iter/2;
130		iter = long((2/t)*iter) + 1;
	132	iter = long((3/t)*iter) + 1;
131	133
132	134	double tvec[5];
133	135	long w;

-14

src/NOTES less more

0	0
1		============================
2	1
3	2	FIXME: maybe it would make more sense to take the +1/-1 logic
4	3	out of [cg]_lip_impl block_construct routines and just put it in

17	16	* add template functions clear(), to clear multiple
18	17	entries in a Vec or Poly. The important thing is
19	18	to provide specialized ones for Vec<GF2> and GF2X.
20		* RandomBnd_long is too slow...it is not a great idea
21		to work with ZZ's for large n...better strategy would be
22		simple "expected 2 iterations" strategy.
23	19
24	20
25	21

42	38	make sure these changes are implemented in the template files
43	39	mfile and cfile, and then run:
44	40
45		./configure --nowrite
46		cp mfileout def_makefile
47		cp cfileout ../include/NTL/def_config.h
48
49		- run:
50
51		NOTE: try executing
52	41	export COPYFILE_DISABLE=1
53		beforehand
54
55
56	42	make ppdoc
57	43	make ppclean
58	44	make package

+45

-21

src/Poly1TimeTest.c less more

75	75
76	76	#endif
77	77
	78	SetSeed(ZZ(0));
	79
78	80
79	81	long n, k;
80	82

84	86	ZZ p;
85	87
86	88	RandomLen(p, k);
	89	if (!IsOdd(p)) p++;
87	90
88	91
89	92	ZZ_p::init(p); // initialization

135	138
136	139	for (r = 0; r < nprimes; r++) UseFFTPrime(r);
137	140
138		vec_long aa[nprimes], AA[nprimes];
	141	vec_long A1[nprimes], A2[nprimes];
	142	vec_long B1[nprimes], B2[nprimes];
139	143
140	144	for (r = 0; r < nprimes; r++) {
141		aa[r].SetLength(N);
142		AA[r].SetLength(N);
143
144		for (i = 0; i < N; i++)
145		aa[r][i] = RandomBnd(GetFFTPrime(r));
146
147
148		FFTFwd(AA[r].elts(), aa[r].elts(), L, r);
149		FFTRev1(AA[r].elts(), AA[r].elts(), L, r);
	145	A1[r].SetLength(N);
	146	A2[r].SetLength(N);
	147	B1[r].SetLength(N);
	148	B2[r].SetLength(N);
	149
	150	for (i = 0; i < N; i++) {
	151	A1[r][i] = RandomBnd(GetFFTPrime(r));
	152	A2[r][i] = RandomBnd(GetFFTPrime(r));
	153	}
	154	}
	155
	156	for (r = 0; r < nprimes; r++) {
	157	long *A1p = A1[r].elts();
	158	long *A2p = A2[r].elts();
	159	long *B1p = B1[r].elts();
	160	long *B2p = B2[r].elts();
	161	long q = GetFFTPrime(r);
	162	mulmod_t qinv = GetFFTPrimeInv(r);
	163
	164	FFTFwd(B1p, A1p, L, r);
	165	FFTFwd(B2p, A2p, L, r);
	166	for (i = 0; i < N; i++) B1p[i] = NormalizedMulMod(B1p[i], B2p[i], q, qinv);
	167	FFTRev1(B1p, B1p, L, r);
150	168	}
151	169
152	170	iter = 1;

155	173	t = GetTime();
156	174	for (j = 0; j < iter; j++) {
157	175	for (r = 0; r < nprimes; r++) {
158		long *AAp = AA[r].elts();
159		long *aap = aa[r].elts();
	176	long *A1p = A1[r].elts();
	177	long *A2p = A2[r].elts();
	178	long *B1p = B1[r].elts();
	179	long *B2p = B2[r].elts();
160	180	long q = GetFFTPrime(r);
161	181	mulmod_t qinv = GetFFTPrimeInv(r);
162	182
163		FFTFwd(AAp, aap, L, r);
164		FFTRev1(AAp, aap, L, r);
165		for (i = 0; i < N; i++) AAp[i] = NormalizedMulMod(AAp[i], aap[i], q, qinv);
	183	FFTFwd(B1p, A1p, L, r);
	184	FFTFwd(B2p, A2p, L, r);
	185	for (i = 0; i < N; i++) B1p[i] = NormalizedMulMod(B1p[i], B2p[i], q, qinv);
	186	FFTRev1(B1p, B1p, L, r);
166	187	}
167	188	}
168	189	t = GetTime() - t;

171	192
172	193	iter = iter/2;
173	194
174		iter = long((1.5/t)*iter) + 1;
	195	iter = long((3/t)*iter) + 1;
175	196
176	197
177	198	double tvec[5];

181	202	t = GetTime();
182	203	for (j = 0; j < iter; j++) {
183	204	for (r = 0; r < nprimes; r++) {
184		long *AAp = AA[r].elts();
185		long *aap = aa[r].elts();
	205	long *A1p = A1[r].elts();
	206	long *A2p = A2[r].elts();
	207	long *B1p = B1[r].elts();
	208	long *B2p = B2[r].elts();
186	209	long q = GetFFTPrime(r);
187	210	mulmod_t qinv = GetFFTPrimeInv(r);
188	211
189		FFTFwd(AAp, aap, L, r);
190		FFTRev1(AAp, aap, L, r);
191		for (i = 0; i < N; i++) AAp[i] = NormalizedMulMod(AAp[i], aap[i], q, qinv);
	212	FFTFwd(B1p, A1p, L, r);
	213	FFTFwd(B2p, A2p, L, r);
	214	for (i = 0; i < N; i++) B1p[i] = NormalizedMulMod(B1p[i], B2p[i], q, qinv);
	215	FFTRev1(B1p, B1p, L, r);
192	216	}
193	217	}
194	218	t = GetTime() - t;

+186

-0

src/Poly2TimeTest.c less more

	0
	1	#include <NTL/ZZ_pX.h>
	2
	3	#include <cstdio>
	4
	5	NTL_CLIENT
	6
	7
	8	double clean_data(double *t)
	9	{
	10	double x, y, z;
	11	long i, ix, iy, n;
	12
	13	x = t[0]; ix = 0;
	14	y = t[0]; iy = 0;
	15
	16	for (i = 1; i < 5; i++) {
	17	if (t[i] < x) {
	18	x = t[i];
	19	ix = i;
	20	}
	21	if (t[i] > y) {
	22	y = t[i];
	23	iy = i;
	24	}
	25	}
	26
	27	z = 0; n = 0;
	28	for (i = 0; i < 5; i++) {
	29	if (i != ix && i != iy) z+= t[i], n++;
	30	}
	31
	32	z = z/n;
	33
	34	return z;
	35	}
	36
	37	void print_flag()
	38	{
	39
	40
	41	#if (defined(NTL_TBL_REM))
	42	printf("TBL_REM ");
	43	#elif (defined(NTL_TBL_REM_LL))
	44	printf("TBL_REM_LL ");
	45	#else
	46	printf("DEFAULT ");
	47	#endif
	48
	49
	50	printf("\n");
	51
	52	}
	53
	54
	55	int main()
	56	{
	57
	58	#if (defined(NTL_TBL_REM) && defined(NTL_GMP_LIP) && !(defined(NTL_HAVE_LL_TYPE) && NTL_ZZ_NBITS == NTL_BITS_PER_LONG))
	59	{
	60	printf("999999999999999 ");
	61	print_flag();
	62	return 0;
	63	}
	64
	65	#endif
	66
	67	#if (defined(NTL_TBL_REM_LL) && !defined(NTL_GMP_LIP) && !defined(NTL_HAVE_LL_TYPE))
	68	{
	69	printf("999999999999999 ");
	70	print_flag();
	71	return 0;
	72	}
	73
	74	#endif
	75
	76	SetSeed(ZZ(0));
	77
	78	long n, k;
	79
	80	n = 200;
	81	k = 10*NTL_ZZ_NBITS;
	82
	83	ZZ p;
	84
	85	RandomLen(p, k);
	86	if (!IsOdd(p)) p++;
	87
	88
	89	ZZ_p::init(p); // initialization
	90
	91	ZZ_pX f, g, h, r1, r2, r3;
	92
	93	random(g, n); // g = random polynomial of degree < n
	94	random(h, n); // h = " "
	95	random(f, n); // f = " "
	96
	97	SetCoeff(f, n); // Sets coefficient of X^n to 1
	98
	99	// For doing arithmetic mod f quickly, one must pre-compute
	100	// some information.
	101
	102	ZZ_pXModulus F;
	103	build(F, f);
	104
	105	PlainMul(r1, g, h); // this uses classical arithmetic
	106	PlainRem(r1, r1, f);
	107
	108	MulMod(r2, g, h, F); // this uses the FFT
	109
	110	MulMod(r3, g, h, f); // uses FFT, but slower
	111
	112	// compare the results...
	113
	114	if (r1 != r2) {
	115	printf("999999999999999 ");
	116	print_flag();
	117	return 0;
	118	}
	119	else if (r1 != r3) {
	120	printf("999999999999999 ");
	121	print_flag();
	122	return 0;
	123	}
	124
	125	double t;
	126	long i;
	127	long iter;
	128
	129	n = 1024;
	130	k = 1600;
	131	RandomLen(p, k);
	132	if (!IsOdd(p)) p++;
	133
	134	ZZ_p::init(p);
	135
	136	ZZ_pX a;
	137	random(a, n);
	138	long da = deg(a);
	139
	140	ZZ_pXModRep modrep;
	141	ToZZ_pXModRep(modrep, a, 0, da);
	142
	143	iter = 1;
	144
	145	do {
	146	t = GetTime();
	147	for (i = 0; i < iter; i++) {
	148	ToZZ_pXModRep(modrep, a, 0, da);
	149	}
	150	t = GetTime() - t;
	151	iter = 2*iter;
	152	} while(t < 1);
	153
	154	iter = iter/2;
	155
	156	iter = long((3/t)*iter) + 1;
	157
	158	double tvec[5];
	159	long w;
	160
	161	for (w = 0; w < 5; w++) {
	162	t = GetTime();
	163	for (i = 0; i < iter; i++) {
	164	ToZZ_pXModRep(modrep, a, 0, da);
	165	}
	166	t = GetTime() - t;
	167	tvec[w] = t;
	168	}
	169
	170
	171	t = clean_data(tvec);
	172
	173	t = floor((t/iter)*1e12);
	174
	175	if (t < 0 || t >= 1e15)
	176	printf("999999999999999 ");
	177	else
	178	printf("%015.0f ", t);
	179
	180	printf(" [%ld] ", iter);
	181
	182	print_flag();
	183
	184	return 0;
	185	}

+188

-0

src/Poly3TimeTest.c less more

	0
	1	#include <NTL/ZZ_pX.h>
	2
	3	#include <cstdio>
	4
	5	NTL_CLIENT
	6
	7
	8	double clean_data(double *t)
	9	{
	10	double x, y, z;
	11	long i, ix, iy, n;
	12
	13	x = t[0]; ix = 0;
	14	y = t[0]; iy = 0;
	15
	16	for (i = 1; i < 5; i++) {
	17	if (t[i] < x) {
	18	x = t[i];
	19	ix = i;
	20	}
	21	if (t[i] > y) {
	22	y = t[i];
	23	iy = i;
	24	}
	25	}
	26
	27	z = 0; n = 0;
	28	for (i = 0; i < 5; i++) {
	29	if (i != ix && i != iy) z+= t[i], n++;
	30	}
	31
	32	z = z/n;
	33
	34	return z;
	35	}
	36
	37	void print_flag()
	38	{
	39
	40
	41	#if (defined(NTL_CRT_ALTCODE))
	42	printf("CRT_ALTCODE ");
	43	#else
	44	printf("DEFAULT ");
	45	#endif
	46
	47
	48	printf("\n");
	49
	50	}
	51
	52
	53	int main()
	54	{
	55
	56	#if (defined(NTL_CRT_ALTCODE) && !(defined(NTL_HAVE_LL_TYPE) && NTL_ZZ_NBITS == NTL_BITS_PER_LONG))
	57
	58	{
	59	printf("999999999999999 ");
	60	print_flag();
	61	return 0;
	62	}
	63
	64
	65	#endif
	66
	67	SetSeed(ZZ(0));
	68
	69	long n, k;
	70
	71	n = 1024;
	72	k = 30*NTL_SP_NBITS;
	73
	74	ZZ p;
	75
	76	RandomLen(p, k);
	77	if (!IsOdd(p)) p++;
	78
	79
	80	ZZ_p::init(p); // initialization
	81
	82	ZZ_pX f, g, h, r1, r2, r3;
	83
	84	random(g, n); // g = random polynomial of degree < n
	85	random(h, n); // h = " "
	86	random(f, n); // f = " "
	87
	88	SetCoeff(f, n); // Sets coefficient of X^n to 1
	89
	90	// For doing arithmetic mod f quickly, one must pre-compute
	91	// some information.
	92
	93	ZZ_pXModulus F;
	94	build(F, f);
	95
	96	PlainMul(r1, g, h); // this uses classical arithmetic
	97	PlainRem(r1, r1, f);
	98
	99	MulMod(r2, g, h, F); // this uses the FFT
	100
	101	MulMod(r3, g, h, f); // uses FFT, but slower
	102
	103	// compare the results...
	104
	105	if (r1 != r2) {
	106	printf("999999999999999 ");
	107	print_flag();
	108	return 0;
	109	}
	110	else if (r1 != r3) {
	111	printf("999999999999999 ");
	112	print_flag();
	113	return 0;
	114	}
	115
	116	double t;
	117	long i;
	118	long iter;
	119
	120	ZZ_pX a, b, c;
	121	random(a, n);
	122	random(b, n);
	123	long da = deg(a);
	124	long db = deg(b);
	125	long dc = da + db;
	126	long l = NextPowerOfTwo(dc+1);
	127
	128	FFTRep arep, brep, crep;
	129	ToFFTRep(arep, a, l, 0, da);
	130	ToFFTRep(brep, b, l, 0, db);
	131
	132	mul(crep, arep, brep);
	133
	134	ZZ_pXModRep modrep;
	135	FromFFTRep(modrep, crep);
	136
	137	FromZZ_pXModRep(c, modrep, 0, dc);
	138
	139	iter = 1;
	140
	141	do {
	142	t = GetTime();
	143	for (i = 0; i < iter; i++) {
	144	FromZZ_pXModRep(c, modrep, 0, dc);
	145	}
	146	t = GetTime() - t;
	147	iter = 2*iter;
	148	} while(t < 1);
	149
	150	iter = iter/2;
	151
	152	iter = long((3/t)*iter) + 1;
	153
	154	double tvec[5];
	155	long w;
	156
	157	for (w = 0; w < 5; w++) {
	158	t = GetTime();
	159	for (i = 0; i < iter; i++) {
	160	FromZZ_pXModRep(c, modrep, 0, dc);
	161	}
	162	t = GetTime() - t;
	163	tvec[w] = t;
	164	}
	165
	166
	167	t = clean_data(tvec);
	168
	169	t = floor((t/iter)*1e12);
	170
	171	// The following is just to test some tuning Wizard logic --
	172	// be sure to get rid of this!!
	173	#if (defined(NTL_CRT_ALTCODE))
	174	// t *= 1.12;
	175	#endif
	176
	177	if (t < 0 || t >= 1e15)
	178	printf("999999999999999 ");
	179	else
	180	printf("%015.0f ", t);
	181
	182	printf(" [%ld] ", iter);
	183
	184	print_flag();
	185
	186	return 0;
	187	}

-176

~~src/PolyTimeTest.c~~ less more

0
1		#include <NTL/ZZ_pX.h>
2
3		#include <cstdio>
4
5		NTL_CLIENT
6
7
8		double clean_data(double *t)
9		{
10		double x, y, z;
11		long i, ix, iy, n;
12
13		x = t[0]; ix = 0;
14		y = t[0]; iy = 0;
15
16		for (i = 1; i < 5; i++) {
17		if (t[i] < x) {
18		x = t[i];
19		ix = i;
20		}
21		if (t[i] > y) {
22		y = t[i];
23		iy = i;
24		}
25		}
26
27		z = 0; n = 0;
28		for (i = 0; i < 5; i++) {
29		if (i != ix && i != iy) z+= t[i], n++;
30		}
31
32		z = z/n;
33
34		return z;
35		}
36
37		void print_flag()
38		{
39
40
41		#if (defined(NTL_TBL_REM))
42		printf("TBL_REM ");
43		#elif (defined(NTL_TBL_REM_LL))
44		printf("TBL_REM_LL ");
45		#else
46		printf("DEFAULT ");
47		#endif
48
49
50		printf("\n");
51
52		}
53
54
55		int main()
56		{
57
58		#if (defined(NTL_TBL_REM) && defined(NTL_GMP_LIP))
59
60		if (sizeof(NTL_ULL_TYPE) != 2*sizeof(long) ||
61		NTL_ZZ_NBITS != NTL_BITS_PER_LONG) {
62		printf("999999999999999 ");
63		print_flag();
64		return 0;
65		}
66
67
68		#endif
69
70		long n, k;
71
72		n = 200;
73		k = 10*NTL_ZZ_NBITS;
74
75		ZZ p;
76
77		RandomLen(p, k);
78
79
80		ZZ_p::init(p); // initialization
81
82		ZZ_pX f, g, h, r1, r2, r3;
83
84		random(g, n); // g = random polynomial of degree < n
85		random(h, n); // h = " "
86		random(f, n); // f = " "
87
88		SetCoeff(f, n); // Sets coefficient of X^n to 1
89
90		// For doing arithmetic mod f quickly, one must pre-compute
91		// some information.
92
93		ZZ_pXModulus F;
94		build(F, f);
95
96		PlainMul(r1, g, h); // this uses classical arithmetic
97		PlainRem(r1, r1, f);
98
99		MulMod(r2, g, h, F); // this uses the FFT
100
101		MulMod(r3, g, h, f); // uses FFT, but slower
102
103		// compare the results...
104
105		if (r1 != r2) {
106		printf("999999999999999 ");
107		print_flag();
108		return 0;
109		}
110		else if (r1 != r3) {
111		printf("999999999999999 ");
112		print_flag();
113		return 0;
114		}
115
116		double t;
117		long i;
118		long iter;
119
120		n = 1024;
121		k = 1024;
122		RandomLen(p, k);
123
124		ZZ_p::init(p);
125
126		ZZ_pX j1, j2, j3;
127
128		random(j1, n);
129		random(j2, n);
130
131		mul(j3, j1, j2);
132
133		iter = 1;
134
135		do {
136		t = GetTime();
137		for (i = 0; i < iter; i++) {
138		FFTMul(j3, j1, j2);
139		}
140		t = GetTime() - t;
141		iter = 2*iter;
142		} while(t < 1);
143
144		iter = iter/2;
145
146		iter = long((2/t)*iter) + 1;
147
148		double tvec[5];
149		long w;
150
151		for (w = 0; w < 5; w++) {
152		t = GetTime();
153		for (i = 0; i < iter; i++) {
154		FFTMul(j3, j1, j2);
155		}
156		t = GetTime() - t;
157		tvec[w] = t;
158		}
159
160
161		t = clean_data(tvec);
162
163		t = floor((t/iter)*1e12);
164
165		if (t < 0 || t >= 1e15)
166		printf("999999999999999 ");
167		else
168		printf("%015.0f ", t);
169
170		printf(" [%ld] ", iter);
171
172		print_flag();
173
174		return 0;
175		}

+67

-20

src/QuickTest.c less more

100	100	}
101	101
102	102
103		ZZX SSMul(const ZZX& a, const ZZX& b)
	103
	104	ZZX KarMul(const ZZX& a, const ZZX& b)
104	105	{
105	106	ZZX res;
106		SSMul(res, a, b);
	107	KarMul(res, a, b);
107	108	return res;
108	109	}
	110
109	111
110	112
111	113	int main()

120	122	cerr << "NTL_ZZ_NBITS = " << NTL_ZZ_NBITS << "\n";
121	123	cerr << "NTL_SP_NBITS = " << NTL_SP_NBITS << "\n";
122	124
	125	#ifdef NTL_HAVE_LL_TYPE
	126	cerr << "NTL_HAVE_LL_TYPE\n";
	127	#endif
	128
	129	#ifdef NTL_HAVE_BUILTIN_CLZL
	130	cerr << "NTL_HAVE_BUILTIN_CLZL\n";
	131	#endif
	132
	133	#ifdef NTL_HAVE_AVX
	134	cerr << "NTL_HAVE_AVX\n";
	135	#endif
	136
	137	#ifdef NTL_HAVE_FMA
	138	cerr << "NTL_HAVE_FMA\n";
	139	#endif
	140
	141
	142
123	143	#ifdef NTL_LONGDOUBLE_SP_MULMOD
124	144	cerr << "NTL_LONGDOUBLE_SP_MULMOD\n";
125	145	#endif

151	171	cerr << "NTL_THREADS\n";
152	172	#endif
153	173
	174	#ifdef NTL_DISABLE_TLS_HACK
	175	cerr << "NTL_DISABLE_TLS_HACK\n";
	176	#endif
	177
	178	#ifdef NTL_ENABLE_TLS_HACK
	179	cerr << "NTL_ENABLE_TLS_HACK\n";
	180	#endif
	181
154	182
155	183	#ifdef NTL_EXCEPTIONS
156	184	cerr << "NTL_EXCEPTIONS\n";
157	185	#endif
158	186
	187	#ifdef NTL_THREAD_BOOST
	188	cerr << "NTL_THREAD_BOOST\n";
	189	#endif
	190
159	191
160	192	#ifdef NTL_LEGACY_SP_MULMOD
161		cout << "NTL_LEGACY_SP_MULMOD\n";
	193	cerr << "NTL_LEGACY_SP_MULMOD\n";
162	194	#endif
163	195
164	196
165	197	#ifdef NTL_DISABLE_LONGDOUBLE
166		cout << "NTL_DISABLE_LONGDOUBLE\n";
	198	cerr << "NTL_DISABLE_LONGDOUBLE\n";
167	199	#endif
168	200
169	201
170	202	#ifdef NTL_DISABLE_LONGLONG
171		cout << "NTL_DISABLE_LONGLONG\n";
	203	cerr << "NTL_DISABLE_LONGLONG\n";
	204	#endif
	205
	206	#ifdef NTL_DISABLE_LL_ASM
	207	cerr << "NTL_DISABLE_LL_ASM\n";
	208	#endif
	209
	210	#ifdef NTL_MAXIMIZE_SP_NBITS
	211	cerr << "NTL_MAXIMIZE_SP_NBITS\n";
172	212	#endif
173	213
174	214

183	223	cerr << "NTL_GF2X_LIB\n";
184	224	#endif
185	225
186
187		#ifdef NTL_PCLMUL
188		cerr << "NTL_PCLMUL\n";
189		#endif
190	226
191	227
192	228	#ifdef NTL_LONG_LONG_TYPE

261	297
262	298
263	299	#ifdef NTL_FFT_BIGTAB
264		cout << "NTL_FFT_BIGTAB\n";
	300	cerr << "NTL_FFT_BIGTAB\n";
265	301	#endif
266	302
267	303	#ifdef NTL_FFT_LAZYMUL
268		cout << "NTL_FFT_LAZYMUL\n";
269		#endif
270
271
272
	304	cerr << "NTL_FFT_LAZYMUL\n";
	305	#endif
273	306
274	307
275	308	#ifdef NTL_TBL_REM

281	314	cerr << "NTL_TBL_REM_LL\n";
282	315	#endif
283	316
	317	#ifdef NTL_CRT_ALTCODE
	318	cerr << "NTL_CRT_ALTCODE\n";
	319	#endif
	320
	321	#ifdef NTL_CRT_ALTCODE_SMALL
	322	cerr << "NTL_CRT_ALTCODE_SMALL\n";
	323	#endif
284	324
285	325	#ifdef NTL_GF2X_ALTCODE
286	326	cerr << "NTL_GF2X_ALTCODE\n";

290	330	cerr << "NTL_GF2X_ALTCODE1\n";
291	331	#endif
292	332
293
294	333	#ifdef NTL_GF2X_NOINLINE
295	334	cerr << "NTL_GF2X_NOINLINE\n";
296	335	#endif
	336
	337	#ifdef NTL_PCLMUL
	338	cerr << "NTL_PCLMUL\n";
	339	#endif
	340
297	341
298	342	cerr << "\n\n";
299	343

312	356	cerr << ".";
313	357	RandomLen(p, k);
314	358	ZZ_p::init(p);
	359
315	360
316	361	ZZ_pX a, b, c, c1;
317	362

320	365	random(b, n);
321	366
322	367	FFTMul(c, a, b);
323
324		c1 = conv<ZZ_pX>( SSMul( conv<ZZX>(a), conv<ZZX>(b) ) );
	368	//cerr << ZZ_pInfo->FFTInfo->NumPrimes;
	369
	370	c1 = conv<ZZ_pX>( KarMul( conv<ZZX>(a), conv<ZZX>(b) ) );
325	371
326	372	if (c1 != c) {
327	373	cerr << "ZZ_pX mul failed!\n";

408	454	RandomLen(p, k);
409	455
410	456	ZZ_p::init(p);
	457	if (!IsOdd(p)) p++;
411	458
412	459	ZZ_pX j1, j2, j3;
413	460

417	464	mul(j3, j1, j2);
418	465
419	466	t = GetTime();
420		for (i = 0; i < 100; i++) mul(j3, j1, j2);
	467	for (i = 0; i < 500; i++) mul(j3, j1, j2);
421	468	t = GetTime()-t;
422	469
423	470	cerr << "time to multiply degree 1023 polynomials\n modulo a 1024-bit number: ";
424		cerr << (t/100) << "s";
	471	cerr << (t/500) << "s";
425	472	cerr << "\n";
426	473
427	474	GF2X_time();

+42

-35

src/RR.c less more

13	13	// priority right now.
14	14
15	15
16		NTL_THREAD_LOCAL
	16	NTL_CHEAP_THREAD_LOCAL
17	17	long RR::prec = 150;
18	18
19	19	void RR::SetPrecision(long p)

27	27	prec = p;
28	28	}
29	29
30		NTL_THREAD_LOCAL
	30	NTL_CHEAP_THREAD_LOCAL
31	31	long RR::oprec = 10;
32	32
33	33	void RR::SetOutputPrecision(long p)

107	107
108	108	void random(RR& z)
109	109	{
110		NTL_THREAD_LOCAL static RR t;
	110	NTL_TLS_LOCAL(RR, t);
111	111	RandomBits(t.x, RR::prec);
112	112	t.e = -RR::prec;
113	113	normalize(z, t);

176	176
177	177	void add(RR& z, const RR& a, const RR& b)
178	178	{
179		NTL_THREAD_LOCAL static RR t;
	179	NTL_TLS_LOCAL(RR, t);
180	180
181	181	if (IsZero(a.x)) {
182	182	xcopy(z, b);

229	229
230	230	void sub(RR& z, const RR& a, const RR& b)
231	231	{
232		NTL_THREAD_LOCAL static RR t;
	232	NTL_TLS_LOCAL(RR, t);
233	233
234	234	if (IsZero(a.x)) {
235	235	negate(z, b);

321	321
322	322	void mul(RR& z, const RR& a, const RR& b)
323	323	{
324		NTL_THREAD_LOCAL static RR t;
	324	NTL_TLS_LOCAL(RR, t);
325	325
326	326	mul(t.x, a.x, b.x);
327	327	t.e = a.e + b.e;

343	343
344	344	void sqr(RR& z, const RR& a)
345	345	{
346		NTL_THREAD_LOCAL static RR t;
	346	NTL_TLS_LOCAL(RR, t);
347	347
348	348	sqr(t.x, a.x);
349	349	t.e = a.e + a.e;

382	382	long k = RR::prec - la + lb + 1;
383	383	if (k < 0) k = 0;
384	384
385		NTL_THREAD_LOCAL static RR t;
	385	NTL_TLS_LOCAL(RR, t);
386	386	NTL_ZZRegister(A);
387	387	NTL_ZZRegister(B);
388	388	NTL_ZZRegister(R);

466	466
467	467	long compare(const RR& a, const RR& b)
468	468	{
469		NTL_THREAD_LOCAL static RR t;
	469	NTL_TLS_LOCAL(RR, t);
470	470
471	471	SubPrec(t, a, b, 1);
472	472	return sign(t);

482	482
483	483	void trunc(RR& z, const RR& a)
484	484	{
485		NTL_THREAD_LOCAL static RR t;
	485	NTL_TLS_LOCAL(RR, t);
486	486
487	487	if (a.e >= 0)
488	488	xcopy(z, a);

507	507
508	508	void floor(RR& z, const RR& a)
509	509	{
510		NTL_THREAD_LOCAL static RR t;
	510	NTL_TLS_LOCAL(RR, t);
511	511
512	512	if (a.e >= 0)
513	513	xcopy(z, a);

534	534
535	535	void ceil(RR& z, const RR& a)
536	536	{
537		NTL_THREAD_LOCAL static RR t;
	537	NTL_TLS_LOCAL(RR, t);
538	538
539	539	if (a.e >= 0)
540	540	xcopy(z, a);

582	582	return;
583	583	}
584	584
585		NTL_THREAD_LOCAL static RR t;
	585	NTL_TLS_LOCAL(RR, t);
586	586	ConvPrec(t, a, len+a.e);
587	587	xcopy(z, t);
588	588	}

696	696
697	697	int e;
698	698	double f;
699		NTL_THREAD_LOCAL static RR t;
	699	NTL_TLS_LOCAL(RR, t);
700	700
701	701	f = frexp(a, &e);
702	702

778	778	return;
779	779	}
780	780
781		NTL_THREAD_LOCAL static RR t;
	781	NTL_TLS_LOCAL(RR, t);
782	782
783	783	ConvPrec(t, a, len+a.e);
784	784

800	800	void conv(double& z, const RR& aa)
801	801	{
802	802	double x;
803		NTL_THREAD_LOCAL static RR a;
	803	NTL_TLS_LOCAL(RR, a);
804	804
805	805	ConvPrec(a, aa, NTL_DOUBLE_PRECISION);
806	806	// round to NTL_DOUBLE_PRECISION bits to avoid double overflow

814	814
815	815	void add(RR& z, const RR& a, double b)
816	816	{
817		NTL_THREAD_LOCAL static RR B;
	817	NTL_TLS_LOCAL(RR, B);
818	818	B = b;
819	819	add(z, a, B);
820	820	}

823	823
824	824	void sub(RR& z, const RR& a, double b)
825	825	{
826		NTL_THREAD_LOCAL static RR B;
	826	NTL_TLS_LOCAL(RR, B);
827	827	B = b;
828	828	sub(z, a, B);
829	829	}
830	830
831	831	void sub(RR& z, double a, const RR& b)
832	832	{
833		NTL_THREAD_LOCAL static RR A;
	833	NTL_TLS_LOCAL(RR, A);
834	834	A = a;
835	835	sub(z, A, b);
836	836	}

839	839
840	840	void mul(RR& z, const RR& a, double b)
841	841	{
842		NTL_THREAD_LOCAL static RR B;
	842	NTL_TLS_LOCAL(RR, B);
843	843	B = b;
844	844	mul(z, a, B);
845	845	}

847	847
848	848	void div(RR& z, const RR& a, double b)
849	849	{
850		NTL_THREAD_LOCAL static RR B;
	850	NTL_TLS_LOCAL(RR, B);
851	851	B = b;
852	852	div(z, a, B);
853	853	}
854	854
855	855	void div(RR& z, double a, const RR& b)
856	856	{
857		NTL_THREAD_LOCAL static RR A;
	857	NTL_TLS_LOCAL(RR, A);
858	858	A = a;
859	859	div(z, A, b);
860	860	}

862	862
863	863	void inv(RR& z, const RR& a)
864	864	{
865		NTL_THREAD_LOCAL static RR one = to_RR(1);
	865	NTL_TLS_LOCAL_INIT(RR, one, (to_RR(1)));
866	866	div(z, one, a);
867	867	}
868	868

883	883	{
884	884	if (b == 0) return sign(a);
885	885
886		NTL_THREAD_LOCAL static RR B;
	886	NTL_TLS_LOCAL(RR, B);
887	887	B = b;
888	888	return compare(a, B);
889	889	}

894	894	if (b == 0) return IsZero(a);
895	895	if (b == 1) return IsOne(a);
896	896
897		NTL_THREAD_LOCAL static RR B;
	897	NTL_TLS_LOCAL(RR, B);
898	898	B = b;
899	899	return a == B;
900	900	}

1148	1148
1149	1149	void conv(RR& z, const quad_float& a)
1150	1150	{
1151		NTL_THREAD_LOCAL static RR hi, lo, res;
	1151	NTL_TLS_LOCAL(RR, hi);
	1152	NTL_TLS_LOCAL(RR, lo);
	1153	NTL_TLS_LOCAL(RR, res);
1152	1154
1153	1155	ConvPrec(hi, a.hi, NTL_DOUBLE_PRECISION);
1154	1156	ConvPrec(lo, a.lo, NTL_DOUBLE_PRECISION);

1173	1175
1174	1176	void conv(quad_float& z, const RR& a)
1175	1177	{
1176		NTL_THREAD_LOCAL static RR a_hi, a_lo;
	1178	NTL_TLS_LOCAL(RR, a_hi);
	1179	NTL_TLS_LOCAL(RR, a_lo);
1177	1180
1178	1181	ConvPrec(a_hi, a, NTL_DOUBLE_PRECISION); // high order bits
1179	1182	SubPrec(a_lo, a, a_hi, NTL_DOUBLE_PRECISION); // low order bits

1365	1368
1366	1369	void ComputeE(RR& res)
1367	1370	{
1368		NTL_THREAD_LOCAL static long prec = 0;
1369		NTL_THREAD_LOCAL static RR e;
	1371	static NTL_CHEAP_THREAD_LOCAL long prec = 0;
	1372
	1373	NTL_TLS_LOCAL(RR, e);
1370	1374
1371	1375	RRPush push;
1372	1376	long p = RR::precision();

1469	1473
1470	1474	void ComputeLn2(RR& res)
1471	1475	{
1472		NTL_THREAD_LOCAL static long prec = 0;
1473		NTL_THREAD_LOCAL static RR ln2;
	1476	static NTL_CHEAP_THREAD_LOCAL long prec = 0;
	1477
	1478	NTL_TLS_LOCAL(RR, ln2);
1474	1479
1475	1480	RRPush push;
1476	1481	long p = RR::precision();

1554	1559
1555	1560	void ComputeLn10(RR& res)
1556	1561	{
1557		NTL_THREAD_LOCAL static long prec = 0;
1558		NTL_THREAD_LOCAL static RR ln10;
	1562	static NTL_CHEAP_THREAD_LOCAL long prec = 0;
	1563
	1564	NTL_TLS_LOCAL(RR, ln10);
1559	1565
1560	1566	RRPush push;
1561	1567	long p = RR::precision();

1790	1796
1791	1797	void ComputePi(RR& res)
1792	1798	{
1793		NTL_THREAD_LOCAL static long prec = 0;
1794		NTL_THREAD_LOCAL static RR pi;
	1799	static NTL_CHEAP_THREAD_LOCAL long prec = 0;
	1800
	1801	NTL_TLS_LOCAL(RR, pi);
1795	1802
1796	1803	RRPush push;
1797	1804	long p = RR::precision();

-0

src/ResetFeatures less more

	0
	1	echo "" > "$1/include/NTL/HAVE_LL_TYPE.h"
	2	echo "" > "$1/include/NTL/HAVE_BUILTIN_CLZL.h"
	3	echo "" > "$1/include/NTL/HAVE_AVX.h"
	4	echo "" > "$1/include/NTL/HAVE_FMA.h"

-0

src/TestScript less more

79	79	else
80	80	echo "bad MatrixTest"
81	81	fi
	82
	83	echo
	84	echo "---------------------------------"
	85	echo "making mat_lzz_pTest"
	86	make mat_lzz_pTest
	87	echo "running mat_lzz_pTest"
	88	./mat_lzz_pTest
	89	sh RemoveProg mat_lzz_pTest
	90
82	91
83	92	echo
84	93	echo "---------------------------------"

+75

-30

src/ThreadTest.c less more

2	2	#ifdef NTL_THREADS
3	3
4	4
	5	#include <NTL/ZZX.h>
5	6	#include <NTL/ZZ_pXFactoring.h>
6		#include <NTL/thread.h>
7
8		#include <thread>
9
10
	7	#include <NTL/BasicThreadPool.h>
11	8	#include <cstdio>
12	9
13	10	NTL_CLIENT
14	11
	12	#if 1
15	13
16	14
17		void task(ZZ_pContext context, ZZ_pX f, vec_pair_ZZ_pX_long v)
	15	long mobius(long n)
18	16	{
19		fprintf(stderr, "starting %s\n", CurrentThreadID().c_str());
20		context.restore();
21		CanZass(v, f);
22		fprintf(stderr, "stopping %s\n", CurrentThreadID().c_str());
	17	long p,e,arity=0;
	18	PrimeSeq s;
	19	while (n!=1)
	20	{ p=s.next();
	21	e=0;
	22	while ((n%p==0)) { n=n/p; e++; }
	23	if (e>1) { return 0; }
	24	if (e!=0) { arity^=1; }
	25	}
	26	if (arity==0) { return 1; }
	27	return -1;
23	28	}
	29
	30
	31	ZZX Cyclotomic(long N)
	32	{
	33	ZZX Num,Den,G,F;
	34	set(Num); set(Den);
	35	long m,d;
	36	for (d=1; d<=N; d++)
	37	{ if ((N%d)==0)
	38	{ clear(G);
	39	SetCoeff(G,N/d,1); SetCoeff(G,0,-1);
	40	m=mobius(d);
	41	if (m==1) { Num*=G; }
	42	else if (m==-1) { Den*=G; }
	43	}
	44	}
	45	F=Num/Den;
	46	return F;
	47	}
	48
	49	long multOrd(const ZZ& p, long m)
	50	{
	51	long pp = rem(p, m);
	52	if (GCD(pp, m) != 1) return 0;
	53
	54	long ord = 1;
	55	long val = pp;
	56	while (val != 1) {
	57	ord++;
	58	val = MulMod(val, pp, m);
	59	}
	60	return ord;
	61	}
	62
	63	#endif
	64
	65
	66
24	67
25	68
26	69	int main()
27	70	{
	71	SetSeed(ZZ(0));
	72
28	73	long NumContexts = 3;
29	74	long NumPolys = 6;
30		long n = 500;
	75	long n = 2000;
31	76
32	77	Vec<ZZ_pContext> context_vec;
33	78	context_vec.SetLength(NumContexts);
34	79
35		long i;
36		for (i = 0; i < NumContexts; i++) {
	80	for (long i = 0; i < NumContexts; i++) {
37	81	ZZ p;
38		RandomPrime(p, 150 + i*50);
	82	GenPrime(p, 150 + i*20);
39	83	context_vec[i] = ZZ_pContext(p);
40	84	}
41	85
42	86	Vec<ZZ_pX> poly_vec;
43	87	Vec<vec_pair_ZZ_pX_long> res_vec;
44		Vec< SmartPtr<thread> > thread_vec;
45	88
46	89	poly_vec.SetLength(NumPolys);
47	90	res_vec.SetLength(NumPolys);
48		thread_vec.SetLength(NumPolys);
49	91
50		for (i = 0; i < NumPolys; i++) {
51		ZZ_pPush push(context_vec[i % NumContexts]);
52		random(poly_vec[i], n);
53		SetCoeff(poly_vec[i], n);
	92
	93	for (long i = 0; i < NumPolys; i++) {
	94	context_vec[i % NumContexts].restore();
	95	ZZX f = Cyclotomic(n+i);
	96	conv(poly_vec[i], f);
54	97	}
	98
55	99
56	100	cerr << "START\n";
57	101
58		for (i = 0; i < NumPolys; i++)
59		thread_vec[i] = MakeSmart<thread>(task, context_vec[i % NumContexts],
60		&poly_vec[i], &res_vec[i]);
	102	BasicThreadPool pool(NumPolys);
61	103
62		for (i = 0; i < NumPolys; i++)
63		thread_vec[i]->join();
	104	pool.exec_index(NumPolys,
	105	[&](long i) {
	106	fprintf(stderr, "starting %ld: %s\n", i, CurrentThreadID().c_str());
	107	context_vec[i % NumContexts].restore();
	108	CanZass(res_vec[i], poly_vec[i]);
	109	fprintf(stderr, "stopping %ld: %s\n", i, CurrentThreadID().c_str());
	110	});
64	111
65	112	cerr << "checking results...\n";
66	113
67	114
68		for (i = 0; i < NumPolys; i++) {
69		ZZ_pPush push(context_vec[i % NumContexts]);
70		vec_pair_ZZ_pX_long v;
71		berlekamp(v, poly_vec[i]);
72		if (v.length() == res_vec[i].length() && mul(v) == mul(res_vec[i]))
	115	for (long i = 0; i < NumPolys; i++) {
	116	context_vec[i % NumContexts].restore();
	117	if (res_vec[i].length() == deg(poly_vec[i])/multOrd(ZZ_p::modulus(), n+i))
73	118	cerr << i << " GOOD\n";
74	119	else
75	120	cerr << i << " BAD\n";

-1

src/VERSION_INFO less more

0		16:0:0
	0	26:0:0

-1

src/WINDIR less more

0		WinNTL-9_3_0
	0	WinNTL-9_9_0

-4

src/Wizard less more

27	27	mkdir small/include/NTL
28	28
29	29	cp MulTimeTest.c small/src
30		cp PolyTimeTest.c small/src
31	30	cp Poly1TimeTest.c small/src
	31	cp Poly2TimeTest.c small/src
	32	cp Poly3TimeTest.c small/src
32	33	cp GF2XTimeTest.c small/src
33	34	cp InitSettings.c small/src
34	35	cp DispSettings.c small/src

54	55	cp GF2X.c small/src
55	56	cp GF2X1.c small/src
56	57	cp thread.c small/src
	58	cp BasicThreadPool.c small/src
57	59	cp fileio.c small/src
58	60
59	61
60	62
	63	sh CopyFeatures '..' small
61	64	cp ../include/NTL/FFT.h small/include/NTL
62	65	cp ../include/NTL/SPMM_ASM.h small/include/NTL
63	66	cp ../include/NTL/ctools.h small/include/NTL
64		cp ../include/NTL/have_LL.h small/include/NTL
65		cp ../include/NTL/have_builtin_clzl.h small/include/NTL
66	67	cp ../include/NTL/ZZ.h small/include/NTL
67	68	cp ../include/NTL/sp_arith.h small/include/NTL
68	69	cp ../include/NTL/ZZVec.h small/include/NTL

79	80	cp ../include/NTL/Lazy.h small/include/NTL
80	81	cp ../include/NTL/LazyTable.h small/include/NTL
81	82	cp ../include/NTL/thread.h small/include/NTL
	83	cp ../include/NTL/BasicThreadPool.h small/include/NTL
82	84	cp ../include/NTL/fileio.h small/include/NTL
83	85	cp ../include/NTL/tools.h small/include/NTL
84	86	cp ../include/NTL/vec_ZZ.h small/include/NTL

103	105
104	106	echo "*"
105	107	echo "*"
106		echo "* Updating config.h"
	108	echo "* Updating config.h and wizard_log.h"
107	109	echo "*"
108	110	echo "*"
109	111
110	112	cp small/include/NTL/config.h ../include/NTL/config.h
	113	cp small/src/wizard_log.h ../include/NTL/wizard_log.h
111	114
112	115	rm -r small
113	116

+84

-39

src/WizardAux less more

84	84	'NTL_SPMM_ASM' => 0,
85	85	'NTL_TBL_REM' => 0,
86	86	'NTL_TBL_REM_LL' => 0,
	87	'NTL_CRT_ALTCODE' => 0,
	88	'NTL_CRT_ALTCODE_SMALL'=> 0,
87	89	'NTL_AVOID_BRANCHING' => 0,
88	90	'NTL_GF2X_ALTCODE' => 0,
89	91	'NTL_GF2X_ALTCODE1' => 0,
90	92	'NTL_GF2X_NOINLINE' => 0,
	93	'NTL_PCLMUL' => 0,
91	94	'NTL_FFT_BIGTAB' => 0,
92	95	'NTL_FFT_LAZYMUL' => 0,
93	96

182	185	GenConfigHeader();
183	186	$time1 = RunProg("Poly1TimeTest");
184	187
185		if ($time1*1.0 > $time*1.05) {
	188	if ($time1*1.0 > $time*1.04) {
186	189	# stick with BIGTABs
187	190	$Config{"NTL_FFT_BIGTAB"} = 1;
188	191	}

195	198	unlink("lip.o");
196	199
197	200
198		if ($Config{"NTL_PCLMUL"} == 0) {
199
200		# set the flags GF2X_NOINLINE and GF2X_ALTCODE...try all pairs
201		# bit don't bother with this if PCLMUL is enabled
202
203		$time = "999999999999999";
204		$aflag = "default";
205		$bflag = "default";
206
207		foreach $aflag1 ("default", "NTL_GF2X_NOINLINE") {
208		foreach $bflag1 ("default", "NTL_GF2X_ALTCODE", "NTL_GF2X_ALTCODE1") {
209
210		$Config{$aflag1} = 1;
211		$Config{$bflag1} = 1;
212		GenConfigHeader();
213		$time1 = RunProg("GF2XTimeTest");
214
215		if ($time1 < $time) {
216		$aflag = $aflag1;
217		$bflag = $bflag1;
218		$time = $time1;
219		}
220
221		$Config{$aflag1} = 0;
222		$Config{$bflag1} = 0;
223		unlink("GF2X.o");
224		}
225		}
226
	201
	202
	203	# set flags NTL_GF2X_NOINLINE, NTL_GF2X_ALTCODE, NTL_GF2X_ALTCODE1
	204
	205	$time = "999999999999999";
	206	$aflag = "default";
	207	$bflag = "default";
	208
	209	foreach $aflag1 ("default", "NTL_GF2X_NOINLINE") {
	210	foreach $bflag1 ("default", "NTL_GF2X_ALTCODE", "NTL_GF2X_ALTCODE1") {
	211
	212	$Config{$aflag1} = 1;
	213	$Config{$bflag1} = 1;
	214	GenConfigHeader();
	215	$time1 = RunProg("GF2XTimeTest");
	216
	217	if ($time1 < $time) {
	218	$aflag = $aflag1;
	219	$bflag = $bflag1;
	220	$time = $time1;
	221	}
	222
	223	$Config{$aflag1} = 0;
	224	$Config{$bflag1} = 0;
	225	unlink("GF2X.o");
	226	}
	227	}
	228
	229
	230
	231	# now try NTL_PCLMUL instead
	232	unlink("GF2X.o");
	233	unlink("GF2X1.o");
	234	$Config{"NTL_PCLMUL"} = 1;
	235	GenConfigHeader();
	236	$time1 = RunProg("GF2XTimeTest");
	237	unlink("GF2X.o");
	238	unlink("GF2X1.o");
	239	if ($time1 >= $time) {
	240	$Config{"NTL_PCLMUL"} = 0;
227	241	$Config{$aflag} = 1;
228	242	$Config{$bflag} = 1;
229
230
231		}
232
233
	243	}
234	244
235	245	if ($Config{"NTL_GMP_LIP"} == 0) {
236	246

265	275	foreach $flag1 ("default", "NTL_TBL_REM", "NTL_TBL_REM_LL") {
266	276	$Config{$flag1} = 1;
267	277	GenConfigHeader();
268		$time1 = RunProg("PolyTimeTest");
	278	$time1 = RunProg("Poly2TimeTest");
269	279
270	280	if ($time1 < $time) {
271	281	$flag = $flag1;

288	298	foreach $flag1 ("default", "NTL_TBL_REM") {
289	299	$Config{$flag1} = 1;
290	300	GenConfigHeader();
291		$time1 = RunProg("PolyTimeTest");
	301	$time1 = RunProg("Poly2TimeTest");
292	302
293	303	if ($time1 < $time) {
294	304	$flag = $flag1;

301	311
302	312	$Config{$flag} = 1;
303	313
	314
	315	# set NTL_CRT_ALTCODE
	316
	317	$time = "999999999999999";
	318	$flag = "default";
	319
	320	foreach $flag1 ("default", "NTL_CRT_ALTCODE") {
	321	$Config{$flag1} = 1;
	322	GenConfigHeader();
	323	$time1 = RunProg("Poly3TimeTest");
	324
	325	if ($time1 < $time) {
	326	$flag = $flag1;
	327	$time = $time1;
	328	}
	329
	330	$Config{$flag1} = 0;
	331	unlink("lip.o");
	332	}
	333
	334	$Config{$flag} = 1;
	335
	336	# set NTL_CRT_ALTCODE_SMALL, if NTL_CRT_ALTCODE
	337	# not set but it did not perform too badly
	338
	339	if ($Config{"NTL_CRT_ALTCODE"} == 0) {
	340	# time measures default and time1 measures ALTCODE
	341	if (1.0*$time1 < 1.15*$time) {
	342	$Config{"NTL_CRT_ALTCODE_SMALL"} = 1;
	343	}
	344	}
	345
	346
	347
304	348	}
305	349
306	350	$Config{'WIZARD_HACK'} = "";

310	354
311	355	system("make DispSettings");
312	356	system("./DispSettings");
313
314
	357	system("./DispSettings > wizard_log.h");
	358
	359

-1

src/WordVector.c less more

123	123
124	124	void CopySwap(WordVector& x, WordVector& y)
125	125	{
126		NTL_THREAD_LOCAL static WordVector t;
	126	NTL_TLS_LOCAL(WordVector, t);
127	127	WordVectorWatcher watch_t(t);
128	128
129	129	long sz_x = x.length();

+674

-520

src/ZZ.c less more

3	3	#include <NTL/Lazy.h>
4	4	#include <NTL/fileio.h>
5	5
	6	#include <cstring>
	7
6	8
7	9
8	10	NTL_START_IMPL

13	15
14	16	const ZZ& ZZ::zero()
15	17	{
16		NTL_THREAD_LOCAL static ZZ z;
	18
	19	static const ZZ z; // GLOBAL (relies on C++11 thread-safe init)
17	20	return z;
18	21	}
19	22
20	23
21	24	const ZZ& ZZ_expo(long e)
22	25	{
23		NTL_THREAD_LOCAL static ZZ expo_helper;
	26	NTL_TLS_LOCAL(ZZ, expo_helper);
	27
24	28	conv(expo_helper, e);
25	29	return expo_helper;
26	30	}

53	57
54	58	// ****** input and output
55	59
56		NTL_THREAD_LOCAL static long iodigits = 0;
57		NTL_THREAD_LOCAL static long ioradix = 0;
58
	60
	61	static NTL_CHEAP_THREAD_LOCAL long iodigits = 0;
	62	static NTL_CHEAP_THREAD_LOCAL long ioradix = 0;
59	63	// iodigits is the greatest integer such that 10^{iodigits} < NTL_WSP_BOUND
60	64	// ioradix = 10^{iodigits}
61	65

172	176	static
173	177	void PrintDigits(ostream& s, long d, long justify)
174	178	{
175		NTL_THREAD_LOCAL static Vec<char> buf(INIT_SIZE, iodigits);
	179	NTL_TLS_LOCAL_INIT(Vec<char>, buf, (INIT_SIZE, iodigits));
176	180
177	181	long i = 0;
178	182

320	324	t = v1;
321	325	}
322	326
	327	long InvModStatus(long& x, long a, long n)
	328	{
	329	long d, s, t;
	330
	331	XGCD(d, s, t, a, n);
	332	if (d != 1) {
	333	x = d;
	334	return 1;
	335	}
	336	else {
	337	if (s < 0)
	338	x = s + n;
	339	else
	340	x = s;
	341
	342	return 0;
	343	}
	344	}
323	345
324	346	long InvMod(long a, long n)
325	347	{

1369	1391
1370	1392
1371	1393
1372		// RANDOM NUMBER GENERATION
1373
1374		// Idea for this PRNG. Iteratively hash seed using md5
1375		// to get 256 bytes to initialize arc4.
1376		// Then use arc4 to get a pseudo-random byte stream.
1377
1378		// I've taken care that the pseudo-random numbers generated by
1379		// the routines RandomBnd, RandomBits, and RandomLen
1380		// are completely platform independent.
1381
1382		// I make use of the md5 compression function,
1383		// which I've modified to work on 64-bit machines
1384
1385
1386		/*
1387		* BEGIN RSA's md5 stuff
1388		*
1389		*/
1390
1391		/*
1392		**********************************************************************
1393		md5.c
1394		RSA Data Security, Inc. MD5 Message Digest Algorithm
1395		Created: 2/17/90 RLR
1396		Revised: 1/91 SRD,AJ,BSK,JT Reference C Version
1397		**********************************************************************
1398		*/
1399
1400		/*
1401		**********************************************************************
1402		Copyright (C) 1990, RSA Data Security, Inc. All rights reserved.
1403
1404		License to copy and use this software is granted provided that
1405		it is identified as the "RSA Data Security, Inc. MD5 Message
1406		Digest Algorithm" in all material mentioning or referencing this
1407		software or this function.
1408
1409		License is also granted to make and use derivative works
1410		provided that such works are identified as "derived from the RSA
1411		Data Security, Inc. MD5 Message Digest Algorithm" in all
1412		material mentioning or referencing the derived work.
1413
1414		RSA Data Security, Inc. makes no representations concerning
1415		either the merchantability of this software or the suitability
1416		of this software for any particular purpose. It is provided "as
1417		is" without express or implied warranty of any kind.
1418
1419		These notices must be retained in any copies of any part of this
1420		documentation and/or software.
1421		**********************************************************************
1422		*/
1423
1424
1425		#if (NTL_BITS_PER_LONG <= 32)
1426		#define TRUNC32(x) (x)
	1394	// ======================= new PRG stuff ======================
	1395
	1396
	1397
	1398
	1399	#if (NTL_BITS_PER_INT32 == 32)
	1400	#define INT32MASK(x) (x)
1427	1401	#else
1428		#define TRUNC32(x) ((x) & ((1UL << 32)-1UL))
	1402	#define INT32MASK(x) ((x) & _ntl_uint32(0xffffffff))
1429	1403	#endif
1430	1404
1431		/* F, G and H are basic MD5 functions: selection, majority, parity */
1432		#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
1433		#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
1434		#define H(x, y, z) ((x) ^ (y) ^ (z))
1435		#define I(x, y, z) (TRUNC32((y) ^ ((x) | (~z))))
1436
1437		/* ROTATE_LEFT rotates x left n bits */
1438		#define ROTATE_LEFT(x, n) (TRUNC32(((x) << (n)) | ((x) >> (32-(n)))))
1439
1440		/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
1441		/* Rotation is separate from addition to prevent recomputation */
1442		#define FF(a, b, c, d, x, s, ac) \
1443		{(a) = TRUNC32((a) + F((b), (c), (d)) + (x) + (ac)); \
1444		(a) = ROTATE_LEFT((a), (s)); \
1445		(a) = TRUNC32((a) + (b)); \
1446		}
1447		#define GG(a, b, c, d, x, s, ac) \
1448		{(a) = TRUNC32((a) + G((b), (c), (d)) + (x) + (ac)); \
1449		(a) = ROTATE_LEFT((a), (s)); \
1450		(a) = TRUNC32((a) + (b)); \
1451		}
1452		#define HH(a, b, c, d, x, s, ac) \
1453		{(a) = TRUNC32((a) + H((b), (c), (d)) + (x) + (ac)); \
1454		(a) = ROTATE_LEFT((a), (s)); \
1455		(a) = TRUNC32((a) + (b)); \
1456		}
1457		#define II(a, b, c, d, x, s, ac) \
1458		{(a) = TRUNC32((a) + I((b), (c), (d)) + (x) + (ac)); \
1459		(a) = ROTATE_LEFT((a), (s)); \
1460		(a) = TRUNC32((a) + (b)); \
1461		}
1462
	1405
	1406
	1407	// SHA256 code adapted from an implementation by Brad Conte.
	1408	// The following is from his original source files.
	1409	/*********************************************************************
	1410	* Filename: sha256.c
	1411	* Author: Brad Conte (brad AT bradconte.com)
	1412	* Copyright:
	1413	* Disclaimer: This code is presented "as is" without any guarantees.
	1414	* Details: Implementation of the SHA-256 hashing algorithm.
	1415	SHA-256 is one of the three algorithms in the SHA2
	1416	specification. The others, SHA-384 and SHA-512, are not
	1417	offered in this implementation.
	1418	Algorithm specification can be found here:
	1419	* http://csrc.nist.gov/publications/fips/fips180-2/fips180-2withchangenotice.pdf
	1420	This implementation uses little endian byte order.
	1421	*********************************************************************/
	1422
	1423
	1424
	1425
	1426	#define SHA256_BLOCKSIZE (64)
	1427	#define SHA256_HASHSIZE (32)
	1428
	1429	// DBL_INT_ADD treats two unsigned ints a and b as one 64-bit integer and adds c to it
	1430	static inline
	1431	void DBL_INT_ADD(_ntl_uint32& a, _ntl_uint32& b, _ntl_uint32 c)
	1432	{
	1433	_ntl_uint32 aa = INT32MASK(a);
	1434	if (aa > INT32MASK(_ntl_uint32(0xffffffff) - c)) b++;
	1435	a = aa + c;
	1436	}
	1437
	1438	#define ROTLEFT(a,b) (((a) << (b)) | (INT32MASK(a) >> (32-(b))))
	1439	#define ROTRIGHT(a,b) ((INT32MASK(a) >> (b)) | ((a) << (32-(b))))
	1440
	1441	#define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
	1442	#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
	1443	#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
	1444	#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
	1445	#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ (INT32MASK(x) >> 3))
	1446	#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ (INT32MASK(x) >> 10))
	1447
	1448	struct SHA256_CTX {
	1449	unsigned char data[64];
	1450	_ntl_uint32 datalen;
	1451	_ntl_uint32 bitlen[2];
	1452	_ntl_uint32 state[8];
	1453	};
	1454
	1455	static const _ntl_uint32 sha256_const[64] = {
	1456	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	1457	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	1458	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	1459	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	1460	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	1461	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	1462	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	1463	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	1464	};
1463	1465
1464	1466
1465	1467	static
1466		void MD5_default_IV(unsigned long *buf)
1467		{
1468		buf[0] = 0x67452301UL;
1469		buf[1] = 0xefcdab89UL;
1470		buf[2] = 0x98badcfeUL;
1471		buf[3] = 0x10325476UL;
1472		}
1473
1474
1475
1476		/* Basic MD5 step. Transform buf based on in.
1477		*/
	1468	void sha256_transform(SHA256_CTX& ctx, unsigned char *data)
	1469	{
	1470	_ntl_uint32 a,b,c,d,e,f,g,h,i,j,t1,t2,m[64];
	1471
	1472	for (i=0,j=0; i < 16; ++i, j += 4)
	1473	m[i] = (data[j] << 24) | (data[j+1] << 16) | (data[j+2] << 8) | (data[j+3]);
	1474	for ( ; i < 64; ++i)
	1475	m[i] = SIG1(m[i-2]) + m[i-7] + SIG0(m[i-15]) + m[i-16];
	1476
	1477	a = ctx.state[0];
	1478	b = ctx.state[1];
	1479	c = ctx.state[2];
	1480	d = ctx.state[3];
	1481	e = ctx.state[4];
	1482	f = ctx.state[5];
	1483	g = ctx.state[6];
	1484	h = ctx.state[7];
	1485
	1486	for (i = 0; i < 64; ++i) {
	1487	t1 = h + EP1(e) + CH(e,f,g) + sha256_const[i] + m[i];
	1488	t2 = EP0(a) + MAJ(a,b,c);
	1489	h = g;
	1490	g = f;
	1491	f = e;
	1492	e = d + t1;
	1493	d = c;
	1494	c = b;
	1495	b = a;
	1496	a = t1 + t2;
	1497	}
	1498
	1499	ctx.state[0] += a;
	1500	ctx.state[1] += b;
	1501	ctx.state[2] += c;
	1502	ctx.state[3] += d;
	1503	ctx.state[4] += e;
	1504	ctx.state[5] += f;
	1505	ctx.state[6] += g;
	1506	ctx.state[7] += h;
	1507	}
1478	1508
1479	1509	static
1480		void MD5_compress(unsigned long *buf, unsigned long *in)
1481		{
1482		unsigned long a = buf[0], b = buf[1], c = buf[2], d = buf[3];
1483
1484		/* Round 1 */
1485		#define S11 7
1486		#define S12 12
1487		#define S13 17
1488		#define S14 22
1489		FF ( a, b, c, d, in[ 0], S11, 3614090360UL); /* 1 */
1490		FF ( d, a, b, c, in[ 1], S12, 3905402710UL); /* 2 */
1491		FF ( c, d, a, b, in[ 2], S13, 606105819UL); /* 3 */
1492		FF ( b, c, d, a, in[ 3], S14, 3250441966UL); /* 4 */
1493		FF ( a, b, c, d, in[ 4], S11, 4118548399UL); /* 5 */
1494		FF ( d, a, b, c, in[ 5], S12, 1200080426UL); /* 6 */
1495		FF ( c, d, a, b, in[ 6], S13, 2821735955UL); /* 7 */
1496		FF ( b, c, d, a, in[ 7], S14, 4249261313UL); /* 8 */
1497		FF ( a, b, c, d, in[ 8], S11, 1770035416UL); /* 9 */
1498		FF ( d, a, b, c, in[ 9], S12, 2336552879UL); /* 10 */
1499		FF ( c, d, a, b, in[10], S13, 4294925233UL); /* 11 */
1500		FF ( b, c, d, a, in[11], S14, 2304563134UL); /* 12 */
1501		FF ( a, b, c, d, in[12], S11, 1804603682UL); /* 13 */
1502		FF ( d, a, b, c, in[13], S12, 4254626195UL); /* 14 */
1503		FF ( c, d, a, b, in[14], S13, 2792965006UL); /* 15 */
1504		FF ( b, c, d, a, in[15], S14, 1236535329UL); /* 16 */
1505
1506		/* Round 2 */
1507		#define S21 5
1508		#define S22 9
1509		#define S23 14
1510		#define S24 20
1511		GG ( a, b, c, d, in[ 1], S21, 4129170786UL); /* 17 */
1512		GG ( d, a, b, c, in[ 6], S22, 3225465664UL); /* 18 */
1513		GG ( c, d, a, b, in[11], S23, 643717713UL); /* 19 */
1514		GG ( b, c, d, a, in[ 0], S24, 3921069994UL); /* 20 */
1515		GG ( a, b, c, d, in[ 5], S21, 3593408605UL); /* 21 */
1516		GG ( d, a, b, c, in[10], S22, 38016083UL); /* 22 */
1517		GG ( c, d, a, b, in[15], S23, 3634488961UL); /* 23 */
1518		GG ( b, c, d, a, in[ 4], S24, 3889429448UL); /* 24 */
1519		GG ( a, b, c, d, in[ 9], S21, 568446438UL); /* 25 */
1520		GG ( d, a, b, c, in[14], S22, 3275163606UL); /* 26 */
1521		GG ( c, d, a, b, in[ 3], S23, 4107603335UL); /* 27 */
1522		GG ( b, c, d, a, in[ 8], S24, 1163531501UL); /* 28 */
1523		GG ( a, b, c, d, in[13], S21, 2850285829UL); /* 29 */
1524		GG ( d, a, b, c, in[ 2], S22, 4243563512UL); /* 30 */
1525		GG ( c, d, a, b, in[ 7], S23, 1735328473UL); /* 31 */
1526		GG ( b, c, d, a, in[12], S24, 2368359562UL); /* 32 */
1527
1528		/* Round 3 */
1529		#define S31 4
1530		#define S32 11
1531		#define S33 16
1532		#define S34 23
1533		HH ( a, b, c, d, in[ 5], S31, 4294588738UL); /* 33 */
1534		HH ( d, a, b, c, in[ 8], S32, 2272392833UL); /* 34 */
1535		HH ( c, d, a, b, in[11], S33, 1839030562UL); /* 35 */
1536		HH ( b, c, d, a, in[14], S34, 4259657740UL); /* 36 */
1537		HH ( a, b, c, d, in[ 1], S31, 2763975236UL); /* 37 */
1538		HH ( d, a, b, c, in[ 4], S32, 1272893353UL); /* 38 */
1539		HH ( c, d, a, b, in[ 7], S33, 4139469664UL); /* 39 */
1540		HH ( b, c, d, a, in[10], S34, 3200236656UL); /* 40 */
1541		HH ( a, b, c, d, in[13], S31, 681279174UL); /* 41 */
1542		HH ( d, a, b, c, in[ 0], S32, 3936430074UL); /* 42 */
1543		HH ( c, d, a, b, in[ 3], S33, 3572445317UL); /* 43 */
1544		HH ( b, c, d, a, in[ 6], S34, 76029189UL); /* 44 */
1545		HH ( a, b, c, d, in[ 9], S31, 3654602809UL); /* 45 */
1546		HH ( d, a, b, c, in[12], S32, 3873151461UL); /* 46 */
1547		HH ( c, d, a, b, in[15], S33, 530742520UL); /* 47 */
1548		HH ( b, c, d, a, in[ 2], S34, 3299628645UL); /* 48 */
1549
1550		/* Round 4 */
1551		#define S41 6
1552		#define S42 10
1553		#define S43 15
1554		#define S44 21
1555		II ( a, b, c, d, in[ 0], S41, 4096336452UL); /* 49 */
1556		II ( d, a, b, c, in[ 7], S42, 1126891415UL); /* 50 */
1557		II ( c, d, a, b, in[14], S43, 2878612391UL); /* 51 */
1558		II ( b, c, d, a, in[ 5], S44, 4237533241UL); /* 52 */
1559		II ( a, b, c, d, in[12], S41, 1700485571UL); /* 53 */
1560		II ( d, a, b, c, in[ 3], S42, 2399980690UL); /* 54 */
1561		II ( c, d, a, b, in[10], S43, 4293915773UL); /* 55 */
1562		II ( b, c, d, a, in[ 1], S44, 2240044497UL); /* 56 */
1563		II ( a, b, c, d, in[ 8], S41, 1873313359UL); /* 57 */
1564		II ( d, a, b, c, in[15], S42, 4264355552UL); /* 58 */
1565		II ( c, d, a, b, in[ 6], S43, 2734768916UL); /* 59 */
1566		II ( b, c, d, a, in[13], S44, 1309151649UL); /* 60 */
1567		II ( a, b, c, d, in[ 4], S41, 4149444226UL); /* 61 */
1568		II ( d, a, b, c, in[11], S42, 3174756917UL); /* 62 */
1569		II ( c, d, a, b, in[ 2], S43, 718787259UL); /* 63 */
1570		II ( b, c, d, a, in[ 9], S44, 3951481745UL); /* 64 */
1571
1572		buf[0] = TRUNC32(buf[0] + a);
1573		buf[1] = TRUNC32(buf[1] + b);
1574		buf[2] = TRUNC32(buf[2] + c);
1575		buf[3] = TRUNC32(buf[3] + d);
1576		}
1577
1578
1579		/*
1580		* END RSA's md5 stuff
1581		*
1582		*/
1583
	1510	void sha256_init(SHA256_CTX& ctx)
	1511	{
	1512	ctx.datalen = 0;
	1513	ctx.bitlen[0] = 0;
	1514	ctx.bitlen[1] = 0;
	1515	ctx.state[0] = 0x6a09e667;
	1516	ctx.state[1] = 0xbb67ae85;
	1517	ctx.state[2] = 0x3c6ef372;
	1518	ctx.state[3] = 0xa54ff53a;
	1519	ctx.state[4] = 0x510e527f;
	1520	ctx.state[5] = 0x9b05688c;
	1521	ctx.state[6] = 0x1f83d9ab;
	1522	ctx.state[7] = 0x5be0cd19;
	1523	}
1584	1524
1585	1525	static
1586		void words_from_bytes(unsigned long *txtl, const unsigned char *txtc, long n)
1587		{
	1526	void sha256_update(SHA256_CTX& ctx, const unsigned char *data, _ntl_uint32 len)
	1527	{
	1528	_ntl_uint32 i;
	1529
	1530	for (i=0; i < len; ++i) {
	1531	ctx.data[ctx.datalen] = data[i];
	1532	ctx.datalen++;
	1533	if (ctx.datalen == 64) {
	1534	sha256_transform(ctx,ctx.data);
	1535	DBL_INT_ADD(ctx.bitlen[0],ctx.bitlen[1],512);
	1536	ctx.datalen = 0;
	1537	}
	1538	}
	1539	}
	1540
	1541	static
	1542	void sha256_final(SHA256_CTX& ctx, unsigned char *hash,
	1543	long hlen=SHA256_HASHSIZE)
	1544	{
	1545	_ntl_uint32 i, j;
	1546
	1547	i = ctx.datalen;
	1548
	1549	// Pad whatever data is left in the buffer.
	1550	if (ctx.datalen < 56) {
	1551	ctx.data[i++] = 0x80;
	1552	while (i < 56)
	1553	ctx.data[i++] = 0x00;
	1554	}
	1555	else {
	1556	ctx.data[i++] = 0x80;
	1557	while (i < 64)
	1558	ctx.data[i++] = 0x00;
	1559	sha256_transform(ctx,ctx.data);
	1560	memset(ctx.data,0,56);
	1561	}
	1562
	1563	// Append to the padding the total message's length in bits and transform.
	1564	DBL_INT_ADD(ctx.bitlen[0],ctx.bitlen[1],ctx.datalen * 8);
	1565
	1566	ctx.data[63] = ctx.bitlen[0];
	1567	ctx.data[62] = ctx.bitlen[0] >> 8;
	1568	ctx.data[61] = ctx.bitlen[0] >> 16;
	1569	ctx.data[60] = ctx.bitlen[0] >> 24;
	1570	ctx.data[59] = ctx.bitlen[1];
	1571	ctx.data[58] = ctx.bitlen[1] >> 8;
	1572	ctx.data[57] = ctx.bitlen[1] >> 16;
	1573	ctx.data[56] = ctx.bitlen[1] >> 24;
	1574	sha256_transform(ctx,ctx.data);
	1575
	1576	for (i = 0; i < 8; i++) {
	1577	_ntl_uint32 w = ctx.state[i];
	1578	for (j = 0; j < 4; j++) {
	1579	if (hlen <= 0) break;
	1580	hash[4*i + j] = w >> (24-j*8);
	1581	hlen--;
	1582	}
	1583	}
	1584
	1585	}
	1586
	1587
	1588
	1589	static
	1590	void sha256(const unsigned char *data, long dlen, unsigned char *hash,
	1591	long hlen=SHA256_HASHSIZE)
	1592	{
	1593	if (dlen < 0) dlen = 0;
	1594	if (hlen < 0) hlen = 0;
	1595
	1596	SHA256_CTX ctx;
	1597	sha256_init(ctx);
	1598
	1599	const long BLKSIZE = 4096;
	1600
1588	1601	long i;
1589		unsigned long v;
1590
1591		for (i = 0; i < n; i++) {
1592		v = txtc[4*i];
1593		v += ((unsigned long) (txtc[4*i+1])) << 8;
1594		v += ((unsigned long) (txtc[4*i+2])) << 16;
1595		v += ((unsigned long) (txtc[4*i+3])) << 24;
1596		txtl[i] = v;
1597		}
1598		}
1599
1600		static
1601		void bytes_from_words(unsigned char *txtc, const unsigned long *txtl, long n)
1602		{
	1602	for (i = 0; i <= dlen-BLKSIZE; i += BLKSIZE)
	1603	sha256_update(ctx, data + i, BLKSIZE);
	1604
	1605	if (i < dlen)
	1606	sha256_update(ctx, data + i, dlen - i);
	1607
	1608	sha256_final(ctx, hash, hlen);
	1609	}
	1610
	1611
	1612	static
	1613	void hmac_sha256(const unsigned char *key, long klen,
	1614	const unsigned char *data, long dlen,
	1615	unsigned char *hash, long hlen=SHA256_HASHSIZE)
	1616	{
	1617	if (klen < 0) klen = 0;
	1618	if (dlen < 0) dlen = 0;
	1619	if (hlen < 0) hlen = 0;
	1620
	1621	unsigned char K[SHA256_BLOCKSIZE];
	1622	unsigned char tmp[SHA256_HASHSIZE];
	1623
1603	1624	long i;
1604		unsigned long v;
1605
1606		for (i = 0; i < n; i++) {
1607		v = txtl[i];
1608		txtc[4*i] = v & 255;
1609		v = v >> 8;
1610		txtc[4*i+1] = v & 255;
1611		v = v >> 8;
1612		txtc[4*i+2] = v & 255;
1613		v = v >> 8;
1614		txtc[4*i+3] = v & 255;
1615		}
1616		}
	1625
	1626	if (klen <= SHA256_BLOCKSIZE) {
	1627	for (i = 0; i < klen; i++)
	1628	K[i] = key[i];
	1629	for (i = klen; i < SHA256_BLOCKSIZE; i++)
	1630	K[i] = 0;
	1631	}
	1632	else {
	1633	sha256(key, klen, K, SHA256_BLOCKSIZE);
	1634	for (i = SHA256_HASHSIZE; i < SHA256_BLOCKSIZE; i++)
	1635	K[i] = 0;
	1636	}
	1637
	1638	for (i = 0; i < SHA256_BLOCKSIZE; i++)
	1639	K[i] ^= 0x36;
	1640
	1641	SHA256_CTX ctx;
	1642	sha256_init(ctx);
	1643	sha256_update(ctx, K, SHA256_BLOCKSIZE);
	1644	sha256_update(ctx, data, dlen);
	1645	sha256_final(ctx, tmp);
	1646
	1647	for (i = 0; i < SHA256_BLOCKSIZE; i++)
	1648	K[i] ^= (0x36 ^ 0x5C);
	1649
	1650	sha256_init(ctx);
	1651	sha256_update(ctx, K, SHA256_BLOCKSIZE);
	1652	sha256_update(ctx, tmp, SHA256_HASHSIZE);
	1653	sha256_final(ctx, hash, hlen);
	1654	}
	1655
	1656
	1657	// This key derivation uses HMAC with a zero key to derive
	1658	// an intermediate key K from the data, and then uses HMAC
	1659	// as a PRF in counter mode with key K to derive the final key
	1660
	1661	void DeriveKey(unsigned char *key, long klen,
	1662	const unsigned char *data, long dlen)
	1663	{
	1664	if (dlen < 0) LogicError("DeriveKey: bad args");
	1665	if (klen < 0) LogicError("DeriveKey: bad args");
	1666
	1667	long i, j;
	1668
	1669
	1670	unsigned char K[SHA256_HASHSIZE];
	1671	hmac_sha256(0, 0, data, dlen, K);
	1672
	1673	// initialize 64-bit counter to zero
	1674	unsigned char counter[8];
	1675	for (j = 0; j < 8; j++) counter[j] = 0;
	1676
	1677	for (i = 0; i <= klen-SHA256_HASHSIZE; i += SHA256_HASHSIZE) {
	1678	hmac_sha256(K, SHA256_HASHSIZE, counter, 8, key+i);
	1679
	1680	// increment counter
	1681	for (j = 0; j < 8; j++) {
	1682	counter[j]++;
	1683	if (counter[j] != 0) break;
	1684	}
	1685	}
	1686
	1687	if (i < klen)
	1688	hmac_sha256(K, SHA256_HASHSIZE, counter, 8, key+i, klen-i);
	1689	}
	1690
	1691
	1692
	1693
	1694	// ****************** ChaCha20 stuff *********************
	1695
	1696	static const _ntl_uint32 chacha_const[4] =
	1697	{ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
	1698
	1699
	1700	#define LE(p) (((_ntl_uint32)((p)[0])) + ((_ntl_uint32)((p)[1]) << 8) + \
	1701	((_ntl_uint32)((p)[2]) << 16) + ((_ntl_uint32)((p)[3]) << 24))
	1702
	1703	#define FROMLE(p, x) (p)[0] = (x), (p)[1] = ((x) >> 8), \
	1704	(p)[2] = ((x) >> 16), (p)[3] = ((x) >> 24)
	1705
	1706
	1707	#define QUARTERROUND(x, a, b, c, d) \
	1708	x[a] += x[b], x[d] = ROTLEFT(x[d] ^ x[a], 16), \
	1709	x[c] += x[d], x[b] = ROTLEFT(x[b] ^ x[c], 12), \
	1710	x[a] += x[b], x[d] = ROTLEFT(x[d] ^ x[a], 8), \
	1711	x[c] += x[d], x[b] = ROTLEFT(x[b] ^ x[c], 7)
1617	1712
1618	1713
1619	1714	static
1620		void MD5_compress1(unsigned long *buf, unsigned char *in, long n)
1621		{
1622		unsigned long txtl[16];
1623		unsigned char txtc[64];
1624		long i, j, k;
1625
1626		if (n < 0) n = 0;
1627
1628		i = 0;
1629		while (i < n) {
1630		k = n-i;
1631		if (k > 64) k = 64;
1632		for (j = 0; j < k; j++)
1633		txtc[j] = in[i+j];
1634		for (; j < 64; j++)
1635		txtc[j] = 0;
1636		words_from_bytes(txtl, txtc, 16);
1637		MD5_compress(buf, txtl);
1638		i += k;
1639		}
1640		}
1641
1642
1643		// the "cipherpunk" version of arc4
1644
1645		struct _ZZ_arc4_key
1646		{
1647		unsigned char state[256];
1648		unsigned char x;
1649		unsigned char y;
1650		};
1651
	1715	void salsa20_core(_ntl_uint32* data)
	1716	{
	1717	long i;
	1718
	1719	for (i = 0; i < 10; i++) {
	1720	QUARTERROUND(data, 0, 4, 8, 12);
	1721	QUARTERROUND(data, 1, 5, 9, 13);
	1722	QUARTERROUND(data, 2, 6, 10, 14);
	1723	QUARTERROUND(data, 3, 7, 11, 15);
	1724	QUARTERROUND(data, 0, 5, 10, 15);
	1725	QUARTERROUND(data, 1, 6, 11, 12);
	1726	QUARTERROUND(data, 2, 7, 8, 13);
	1727	QUARTERROUND(data, 3, 4, 9, 14);
	1728	}
	1729	}
	1730
	1731
	1732	// key K must be exactly 32 bytes
	1733	static
	1734	void salsa20_init(_ntl_uint32 state, const unsigned char K)
	1735	{
	1736	long i;
	1737
	1738	for (i = 0; i < 4; i++)
	1739	state[i] = chacha_const[i];
	1740
	1741	for (i = 4; i < 12; i++)
	1742	state[i] = LE(K + 4*(i-4));
	1743
	1744	for (i = 12; i < 16; i++)
	1745	state[i] = 0;
	1746	}
	1747
	1748
	1749
	1750	// state and data are of length 16
	1751	static
	1752	void salsa20_apply(_ntl_uint32 state, _ntl_uint32 data)
	1753	{
	1754	long i;
	1755
	1756	for (i = 0; i < 16; i++) data[i] = state[i];
	1757
	1758	salsa20_core(data);
	1759
	1760	for (i = 0; i < 16; i++) data[i] += state[i];
	1761
	1762	for (i = 12; i < 16; i++) {
	1763	state[i]++;
	1764	state[i] = INT32MASK(state[i]);
	1765	if (state[i] != 0) break;
	1766	}
	1767	}
	1768
	1769
	1770	#if 0
	1771	// state is 16 words, data is 64 bytes
	1772	static
	1773	void salsa20_apply(_ntl_uint32 state, unsigned char data)
	1774	{
	1775	_ntl_uint32 wdata[16];
	1776	salsa20_apply(state, wdata);
	1777
	1778	long i;
	1779	for (i = 0; i < 16; i++)
	1780	FROMLE(data + 4*i, wdata[i]);
	1781
	1782	// FIXME: could use memcpy for above if everything
	1783	// is right
	1784	}
	1785	#endif
	1786
	1787
	1788
	1789	RandomStream::RandomStream(const unsigned char *key)
	1790	{
	1791	salsa20_init(state, key);
	1792	pos = 64;
	1793	}
	1794
	1795
	1796	void RandomStream::do_get(unsigned char *NTL_RESTRICT res, long n)
	1797	{
	1798	if (n < 0) LogicError("RandomStream::get: bad args");
	1799
	1800	long i, j;
	1801
	1802	if (n <= 64-pos) {
	1803	for (i = 0; i < n; i++) res[i] = buf[pos+i];
	1804	pos += n;
	1805	return;
	1806	}
	1807
	1808	// read remainder of buffer
	1809	for (i = 0; i < 64-pos; i++) res[i] = buf[pos+i];
	1810	n -= 64-pos;
	1811	res += 64-pos;
	1812	pos = 64;
	1813
	1814	_ntl_uint32 wdata[16];
	1815
	1816	// read 64-byte chunks
	1817	for (i = 0; i <= n-64; i += 64) {
	1818	salsa20_apply(state, wdata);
	1819	for (j = 0; j < 16; j++)
	1820	FROMLE(res + i + 4*j, wdata[j]);
	1821	}
	1822
	1823	if (i < n) {
	1824	salsa20_apply(state, wdata);
	1825
	1826	for (j = 0; j < 16; j++)
	1827	FROMLE(buf + 4*j, wdata[j]);
	1828
	1829	pos = n-i;
	1830	for (j = 0; j < pos; j++)
	1831	res[i+j] = buf[j];
	1832	}
	1833	}
	1834
	1835
	1836	NTL_TLS_GLOBAL_DECL(UniquePtr<RandomStream>, CurrentRandomStream);
	1837
	1838
	1839	void SetSeed(const RandomStream& s)
	1840	{
	1841	NTL_TLS_GLOBAL_ACCESS(CurrentRandomStream);
	1842
	1843	if (!CurrentRandomStream)
	1844	CurrentRandomStream.make(s);
	1845	else
	1846	*CurrentRandomStream = s;
	1847	}
	1848
	1849
	1850	void SetSeed(const unsigned char *data, long dlen)
	1851	{
	1852	if (dlen < 0) LogicError("SetSeed: bad args");
	1853
	1854	Vec<unsigned char> key;
	1855	key.SetLength(NTL_PRG_KEYLEN);
	1856	DeriveKey(key.elts(), NTL_PRG_KEYLEN, data, dlen);
	1857
	1858	SetSeed(RandomStream(key.elts()));
	1859	}
	1860
	1861	void SetSeed(const ZZ& seed)
	1862	{
	1863	long nb = NumBytes(seed);
	1864
	1865	Vec<unsigned char> buf;
	1866	buf.SetLength(nb);
	1867
	1868	BytesFromZZ(buf.elts(), seed, nb);
	1869
	1870	SetSeed(buf.elts(), nb);
	1871	}
	1872
	1873
	1874	static
	1875	void InitRandomStream()
	1876	{
	1877	const string& id = UniqueID();
	1878	SetSeed((const unsigned char *) id.c_str(), id.length());
	1879	}
1652	1880
1653	1881	static inline
1654		void swap_byte(unsigned char a, unsigned char b)
1655		{
1656		unsigned char swapByte;
1657
1658		swapByte = *a;
1659		a = b;
1660		*b = swapByte;
1661		}
1662
1663		static
1664		void prepare_key(unsigned char *key_data_ptr,
1665		long key_data_len, _ZZ_arc4_key *key)
1666		{
1667		unsigned char index1;
1668		unsigned char index2;
1669		unsigned char* state;
1670		long counter;
1671
1672		state = &key->state[0];
1673		for(counter = 0; counter < 256; counter++)
1674		state[counter] = counter;
1675		key->x = 0;
1676		key->y = 0;
1677		index1 = 0;
1678		index2 = 0;
1679		for(counter = 0; counter < 256; counter++)
1680		{
1681		index2 = (key_data_ptr[index1] + state[counter] + index2) & 255;
1682		swap_byte(&state[counter], &state[index2]);
1683
1684		index1 = (index1 + 1) % key_data_len;
1685		}
1686		}
1687
1688
1689
1690		static
1691		void arc4(unsigned char buffer_ptr, long buffer_len, _ZZ_arc4_key key)
1692		{
1693		unsigned char x;
1694		unsigned char y;
1695		unsigned char* state;
1696		unsigned char xorIndex;
1697		long counter;
1698
1699		x = key->x;
1700		y = key->y;
1701
1702		state = &key->state[0];
1703		for(counter = 0; counter < buffer_len; counter ++)
1704		{
1705		x = (x + 1) & 255;
1706		y = (state[x] + y) & 255;
1707		swap_byte(&state[x], &state[y]);
1708
1709		xorIndex = (state[x] + state[y]) & 255;
1710
1711		buffer_ptr[counter] = state[xorIndex];
1712		}
1713		key->x = x;
1714		key->y = y;
1715		}
1716
1717		// global state information for PRNG
1718
1719		NTL_THREAD_LOCAL static long ran_initialized = 0;
1720		NTL_THREAD_LOCAL static _ZZ_arc4_key ran_key;
1721
1722		static const unsigned long default_md5_tab[16] = {
1723		744663023UL, 1011602954UL, 3163087192UL, 3383838527UL,
1724		3305324122UL, 3197458079UL, 2266495600UL, 2760303563UL,
1725		346234297UL, 1919920720UL, 1896169861UL, 2192176675UL,
1726		2027150322UL, 2090160759UL, 2134858730UL, 1131796244UL
1727		};
1728
1729
1730
1731		static
1732		void build_arc4_tab(unsigned char *seed_bytes, const ZZ& s)
1733		{
1734		long nb = NumBytes(s);
1735
1736		unsigned char *txt;
1737
1738		Vec<unsigned char> txt_storage;
1739		txt_storage.SetLength(nb + 68);
1740		txt = txt_storage.elts();
1741
1742		BytesFromZZ(txt + 4, s, nb);
1743
1744		bytes_from_words(txt + nb + 4, default_md5_tab, 16);
1745
1746		unsigned long buf[4];
1747
1748		unsigned long i;
1749		for (i = 0; i < 16; i++) {
1750		MD5_default_IV(buf);
1751		bytes_from_words(txt, &i, 1);
1752
1753		MD5_compress1(buf, txt, nb + 68);
1754
1755		bytes_from_words(seed_bytes + 16*i, buf, 4);
1756		}
1757		}
1758
1759
1760		void SetSeed(const ZZ& s)
1761		{
1762		unsigned char seed_bytes[256];
1763
1764		build_arc4_tab(seed_bytes, s);
1765		prepare_key(seed_bytes, 256, &ran_key);
1766
1767		// we discard the first 1024 bytes of the arc4 stream, as this is
1768		// recommended practice.
1769
1770		arc4(seed_bytes, 256, &ran_key);
1771		arc4(seed_bytes, 256, &ran_key);
1772		arc4(seed_bytes, 256, &ran_key);
1773		arc4(seed_bytes, 256, &ran_key);
1774
1775		ran_initialized = 1;
1776		}
1777
1778
1779		static
1780		void ran_bytes(unsigned char *bytes, long n)
1781		{
1782		if (!ran_initialized) {
1783		ZZ x;
1784		const string& id = UniqueID();
1785
1786		ZZFromBytes(x, (const unsigned char *) id.c_str(), id.length());
1787		// DIRT: slightly dirty cast from char * to unsigned char *,
1788
1789		SetSeed(x);
1790		}
1791		arc4(bytes, n, &ran_key);
	1882	RandomStream& LocalGetCurrentRandomStream()
	1883	{
	1884	NTL_TLS_GLOBAL_ACCESS(CurrentRandomStream);
	1885
	1886	if (!CurrentRandomStream) InitRandomStream();
	1887	return *CurrentRandomStream;
	1888	}
	1889
	1890	RandomStream& GetCurrentRandomStream()
	1891	{
	1892	return LocalGetCurrentRandomStream();
	1893	}
	1894
	1895
	1896
	1897
	1898
	1899
	1900
	1901	static inline
	1902	unsigned long WordFromBytes(const unsigned char *buf, long n)
	1903	{
	1904	unsigned long res = 0;
	1905	long i;
	1906
	1907	for (i = n-1; i >= 0; i--)
	1908	res = (res << 8) \| buf[i];
	1909
	1910	return res;
1792	1911	}
1793	1912
1794	1913
1795	1914	unsigned long RandomWord()
1796	1915	{
	1916	RandomStream& stream = LocalGetCurrentRandomStream();
1797	1917	unsigned char buf[NTL_BITS_PER_LONG/8];
1798		long i;
1799		unsigned long res;
1800
1801		ran_bytes(buf, NTL_BITS_PER_LONG/8);
1802
1803		res = 0;
1804		for (i = NTL_BITS_PER_LONG/8 - 1; i >= 0; i--) {
1805		res = res << 8;
1806		res = res \| buf[i];
1807		}
1808
1809		return res;
	1918
	1919	stream.get(buf, NTL_BITS_PER_LONG/8);
	1920	return WordFromBytes(buf, NTL_BITS_PER_LONG/8);
1810	1921	}
1811	1922
1812	1923	long RandomBits_long(long l)

1815	1926	if (l >= NTL_BITS_PER_LONG)
1816	1927	ResourceError("RandomBits: length too big");
1817	1928
	1929	RandomStream& stream = LocalGetCurrentRandomStream();
1818	1930	unsigned char buf[NTL_BITS_PER_LONG/8];
1819		unsigned long res;
1820		long i;
1821
1822	1931	long nb = (l+7)/8;
1823		ran_bytes(buf, nb);
1824
1825		res = 0;
1826		for (i = nb - 1; i >= 0; i--) {
1827		res = res << 8;
1828		res = res \| buf[i];
1829		}
1830
1831		return long(res & ((1UL << l)-1UL));
	1932	stream.get(buf, nb);
	1933
	1934	return long(WordFromBytes(buf, nb) & ((1UL << l)-1UL));
1832	1935	}
1833	1936
1834	1937	unsigned long RandomBits_ulong(long l)

1837	1940	if (l > NTL_BITS_PER_LONG)
1838	1941	ResourceError("RandomBits: length too big");
1839	1942
	1943	RandomStream& stream = LocalGetCurrentRandomStream();
1840	1944	unsigned char buf[NTL_BITS_PER_LONG/8];
1841		unsigned long res;
1842		long i;
1843
1844	1945	long nb = (l+7)/8;
1845		ran_bytes(buf, nb);
1846
1847		res = 0;
1848		for (i = nb - 1; i >= 0; i--) {
1849		res = res << 8;
1850		res = res \| buf[i];
1851		}
1852
	1946	stream.get(buf, nb);
	1947	unsigned long res = WordFromBytes(buf, nb);
1853	1948	if (l < NTL_BITS_PER_LONG)
1854	1949	res = res & ((1UL << l)-1UL);
1855
1856	1950	return res;
1857	1951	}
1858	1952

1863	1957	if (l >= NTL_BITS_PER_LONG)
1864	1958	ResourceError("RandomLen: length too big");
1865	1959
1866		return RandomBits_long(l-1) + (1L << (l-1));
1867		}
	1960	RandomStream& stream = LocalGetCurrentRandomStream();
	1961	unsigned char buf[NTL_BITS_PER_LONG/8];
	1962	long nb = ((l-1)+7)/8;
	1963	stream.get(buf, nb);
	1964	unsigned long res = WordFromBytes(buf, nb);
	1965	unsigned long mask = (1UL << (l-1)) - 1UL;
	1966	return long((res & mask) \| (mask+1UL));
	1967	}
	1968
	1969
	1970	long RandomBnd(long bnd)
	1971	{
	1972	if (bnd <= 1) return 0;
	1973
	1974	RandomStream& stream = LocalGetCurrentRandomStream();
	1975	unsigned char buf[NTL_BITS_PER_LONG/8];
	1976	long l = NumBits(bnd-1);
	1977	long nb = (l+7)/8;
	1978
	1979	long tmp;
	1980	do {
	1981	stream.get(buf, nb);
	1982	tmp = long(WordFromBytes(buf, nb) & ((1UL << l)-1UL));
	1983	} while (tmp >= bnd);
	1984
	1985	return tmp;
	1986	}
	1987
1868	1988
1869	1989
1870	1990	void RandomBits(ZZ& x, long l)

1877	1997	if (NTL_OVERFLOW(l, 1, 0))
1878	1998	ResourceError("RandomBits: length too big");
1879	1999
	2000	RandomStream& stream = LocalGetCurrentRandomStream();
	2001
1880	2002	long nb = (l+7)/8;
1881
1882		NTL_THREAD_LOCAL static Vec<unsigned char> buf_mem;
	2003	unsigned long mask = (1UL << (8 - nb*8 + l)) - 1UL;
	2004
	2005	NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
1883	2006	Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
1884	2007
1885	2008	buf_mem.SetLength(nb);
1886	2009	unsigned char *buf = buf_mem.elts();
1887	2010
1888		ran_bytes(buf, nb);
1889
1890		NTL_ZZRegister(res);
1891
1892		ZZFromBytes(res, buf, nb);
1893		trunc(res, res, l);
1894
1895		x = res;
	2011	x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
	2012	// pre-allocate to ensure strong ES
	2013
	2014	stream.get(buf, nb);
	2015	buf[nb-1] &= mask;
	2016
	2017	ZZFromBytes(x, buf, nb);
1896	2018	}
1897	2019
1898	2020

1911	2033	if (NTL_OVERFLOW(l, 1, 0))
1912	2034	ResourceError("RandomLen: length too big");
1913	2035
1914		// pre-allocate space to avoid two allocations
1915		long nw = (l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS;
1916		x.SetSize(nw);
1917
1918		RandomBits(x, l-1);
1919		SetBit(x, l-1);
1920		}
1921
1922
1923		const long RandomBndExcess = 8;
1924
	2036	RandomStream& stream = LocalGetCurrentRandomStream();
	2037
	2038	long nb = (l+7)/8;
	2039	unsigned long mask = (1UL << (8 - nb*8 + l)) - 1UL;
	2040
	2041	NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
	2042	Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
	2043
	2044	buf_mem.SetLength(nb);
	2045	unsigned char *buf = buf_mem.elts();
	2046
	2047	x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
	2048	// pre-allocate to ensure strong ES
	2049
	2050	stream.get(buf, nb);
	2051	buf[nb-1] &= mask;
	2052	buf[nb-1] \|= ((mask >> 1) + 1UL);
	2053
	2054	ZZFromBytes(x, buf, nb);
	2055	}
	2056
	2057
	2058
	2059
	2060
	2061	/**********************************************************
	2062
	2063	The following implementation of RandomBnd is designed
	2064	for speed. It certainly is not resilient against a
	2065	timing side-channel attack (but then again, none of these
	2066	PRG routines are designed to be).
	2067
	2068	The naive strategy generates random candidates of the right
	2069	bit length until the candidate < bnd.
	2070	The idea in this implementation is to generate the high
	2071	order two bytes of the candidate first, and compare this
	2072	to the high order two bytes of tmp. We can discard the
	2073	candidate if this is already too large.
	2074
	2075	***********************************************************/
1925	2076
1926	2077	void RandomBnd(ZZ& x, const ZZ& bnd)
1927	2078	{

1930	2081	return;
1931	2082	}
1932	2083
1933		long k = NumBits(bnd);
1934
1935		if (weight(bnd) == 1) {
1936		RandomBits(x, k-1);
1937		return;
1938		}
1939
1940		long l = k + RandomBndExcess;
1941
1942		NTL_ZZRegister(t);
1943		NTL_ZZRegister(r);
1944		NTL_ZZRegister(t1);
1945
1946		do {
1947		RandomBits(t, l);
1948		rem(r, t, bnd);
1949		sub(t1, bnd, r);
1950		add(t, t, t1);
1951		} while (NumBits(t) > l);
1952
1953		x = r;
1954		}
1955
1956		long RandomBnd(long bnd)
1957		{
1958		if (bnd <= 1) return 0;
1959
1960		long k = NumBits(bnd);
1961
1962		if (((bnd - 1) & bnd) == 0)
1963		return RandomBits_long(k-1);
1964
1965		long l = k + RandomBndExcess;
1966
1967		if (l > NTL_BITS_PER_LONG-2) {
1968		NTL_ZZRegister(Bnd);
1969		NTL_ZZRegister(res);
1970
1971		Bnd = bnd;
1972		RandomBnd(res, Bnd);
1973		return to_long(res);
1974		}
1975
1976		long t, r;
1977
1978		do {
1979		t = RandomBits_long(l);
1980		r = t % bnd;
1981		} while (t + bnd - r > (1L << l));
1982
1983		return r;
	2084	RandomStream& stream = LocalGetCurrentRandomStream();
	2085
	2086	long l = NumBits(bnd);
	2087	long nb = (l+7)/8;
	2088
	2089	if (nb <= 3) {
	2090	long lbnd = conv<long>(bnd);
	2091	unsigned char lbuf[3];
	2092	long ltmp;
	2093
	2094	x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
	2095	// pre-allocate to ensure strong ES
	2096	do {
	2097	stream.get(lbuf, nb);
	2098	ltmp = long(WordFromBytes(lbuf, nb) & ((1UL << l)-1UL));
	2099	} while (ltmp >= lbnd);
	2100
	2101	conv(x, ltmp);
	2102	return;
	2103	}
	2104
	2105	// deal with possible alias
	2106	NTL_ZZRegister(tmp_store);
	2107	const ZZ& bnd_ref = ((&x == &bnd) ? (tmp_store = bnd) : bnd);
	2108
	2109
	2110	NTL_ZZRegister(hbnd);
	2111	RightShift(hbnd, bnd_ref, (nb-2)*8);
	2112	long lhbnd = conv<long>(hbnd);
	2113
	2114	unsigned long mask = (1UL << (16 - nb*8 + l)) - 1UL;
	2115
	2116	NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
	2117	Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
	2118	buf_mem.SetLength(nb);
	2119	unsigned char *buf = buf_mem.elts();
	2120
	2121	unsigned char hbuf[2];
	2122
	2123	x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
	2124	// pre-allocate to ensure strong ES
	2125	for (;;) {
	2126	stream.get(hbuf, 2);
	2127	long hpart = long(WordFromBytes(hbuf, 2) & mask);
	2128
	2129	if (hpart > lhbnd) continue;
	2130
	2131	stream.get(buf, nb-2);
	2132	buf[nb-2] = ((unsigned long) hpart);
	2133	buf[nb-1] = ((unsigned long) hpart) >> 8;
	2134
	2135	ZZFromBytes(x, buf, nb);
	2136	if (hpart < lhbnd \|\| x < bnd_ref) break;
	2137	}
1984	2138	}
1985	2139
1986	2140

1991	2145	static
1992	2146	double Log2(double x)
1993	2147	{
1994		NTL_THREAD_LOCAL static double log2 = log(2.0);
	2148	static const double log2 = log(2.0); // GLOBAL (relies on C++11 thread-safe init)
1995	2149	return log(x)/log2;
1996	2150	}
1997	2151

-1

src/ZZX.c less more

8	8
9	9	const ZZX& ZZX::zero()
10	10	{
11		NTL_THREAD_LOCAL static ZZX z;
	11	static const ZZX z; // GLOBAL (relies on C++11 thread-safe init)
12	12	return z;
13	13	}
14	14

+25

-6

src/ZZX1.c less more

820	820	}
821	821
822	822
823		if (maxa + maxb >= 30 &&
824		SSRatio(deg(a), MaxBits(a), deg(b), MaxBits(b)) < 1.25) {
	823	double rat = SSRatio(deg(a), MaxBits(a), deg(b), MaxBits(b));
	824	long k1 = (maxa + maxb)/2;
	825
	826	if (
	827
	828	(k1 >= 26 && rat < 1.40) \|\|
	829	(k1 >= 53 && rat < 1.60) \|\|
	830	(k1 >= 106 && rat < 1.80) \|\|
	831	(k1 >= 212 && rat < 2.00)
	832
	833	) {
825	834	SSMul(c, a, b);
826	835	}
827	836	else {

1009	1018	}
1010	1019
1011	1020	long mba = MaxBits(a);
1012
1013		if (2*maxa >= 30 &&
1014		SSRatio(deg(a), mba, deg(a), mba) < 1.25)
	1021	double rat = SSRatio(deg(a), mba, deg(a), mba);
	1022	long k1 = maxa;
	1023
	1024	if (
	1025
	1026	(k1 >= 26 && rat < 1.40) \|\|
	1027	(k1 >= 53 && rat < 1.60) \|\|
	1028	(k1 >= 106 && rat < 1.80) \|\|
	1029	(k1 >= 212 && rat < 2.00)
	1030
	1031	) {
1015	1032	SSSqr(c, a);
1016		else
	1033	}
	1034	else {
1017	1035	HomSqr(c, a);
	1036	}
1018	1037	}
1019	1038
1020	1039

-8

src/ZZXFactoring.c less more

10	10
11	11	NTL_START_IMPL
12	12
13		NTL_THREAD_LOCAL long ZZXFac_van_Hoeij = 1;
14
15		NTL_THREAD_LOCAL static long ok_to_abandon = 0;
	13	NTL_CHEAP_THREAD_LOCAL long ZZXFac_van_Hoeij = 1;
	14
	15	static NTL_CHEAP_THREAD_LOCAL long ok_to_abandon = 0;
16	16
17	17	struct LocalInfoT {
18	18	long n;

463	463	f.normalize();
464	464	}
465	465
466		NTL_THREAD_LOCAL long ZZXFac_InitNumPrimes = 7;
467		NTL_THREAD_LOCAL long ZZXFac_MaxNumPrimes = 50;
	466	NTL_CHEAP_THREAD_LOCAL long ZZXFac_InitNumPrimes = 7;
	467	NTL_CHEAP_THREAD_LOCAL long ZZXFac_MaxNumPrimes = 50;
468	468
469	469	static
470	470	void RecordPattern(vec_long& pat, vec_pair_zz_pX_long& fac)

846	846	const vec_ZZ_pX& W, const vec_ZZX& factors,
847	847	const ZZX& f, long k, long verbose)
848	848	{
849		NTL_THREAD_LOCAL static long cnt = 0;
	849	static NTL_CHEAP_THREAD_LOCAL long cnt = 0;
850	850
851	851	if (verbose) {
852	852	cnt = (cnt + 1) % 100;

1610	1610	}
1611	1611
1612	1612
1613		NTL_THREAD_LOCAL long ZZXFac_MaxPrune = 10;
	1613	NTL_CHEAP_THREAD_LOCAL long ZZXFac_MaxPrune = 10;
1614	1614
1615	1615
1616	1616

3652	3652	}
3653	3653	}
3654	3654
3655		NTL_THREAD_LOCAL long ZZXFac_PowerHack = 1;
	3655	NTL_CHEAP_THREAD_LOCAL long ZZXFac_PowerHack = 1;
3656	3656
3657	3657	void SFFactor(vec_ZZX& factors, const ZZX& ff,
3658	3658	long verbose,

+56

-27

src/ZZ_p.c less more

9	9
10	10
11	11
12		NTL_THREAD_LOCAL SmartPtr<ZZ_pInfoT> ZZ_pInfo = 0;
13		NTL_THREAD_LOCAL SmartPtr<ZZ_pTmpSpaceT> ZZ_pTmpSpace = 0;
14		NTL_THREAD_LOCAL bool ZZ_pInstalled = false;
	12	NTL_TLS_GLOBAL_DECL(SmartPtr<ZZ_pInfoT>, ZZ_pInfo_stg)
	13	NTL_TLS_GLOBAL_DECL(SmartPtr<ZZ_pTmpSpaceT>, ZZ_pTmpSpace_stg)
	14
	15	NTL_CHEAP_THREAD_LOCAL ZZ_pInfoT *ZZ_pInfo = 0;
	16	NTL_CHEAP_THREAD_LOCAL ZZ_pTmpSpaceT *ZZ_pTmpSpace = 0;
	17	NTL_CHEAP_THREAD_LOCAL bool ZZ_pInstalled = false;
15	18
16	19
17	20

91	94
92	95	double fn = double(n);
93	96
	97	// NOTE: these next two range checks are somewhat academic,
	98	// but various bits of code in the ZZ_pX implementation
	99	// implicitly rely on them
	100
94	101	if (8.0fn(fn+48) > NTL_FDOUBLE_PRECISION)
95	102	ResourceError("modulus too big");
96	103
97
98		if (8.0fn(fn+48) <= NTL_FDOUBLE_PRECISION/double(NTL_SP_BOUND))
99		FFTInfo->QuickCRT = true;
100		else
101		FFTInfo->QuickCRT = false;
102
103		// FIXME: some of this stuff does not need to be initialized
104		// at all if FFTInfo->crt_struct.special()
105
106		FFTInfo->x.SetLength(n);
107		FFTInfo->u.SetLength(n);
108		FFTInfo->uqinv.SetLength(n);
	104	if (n >= NTL_SP_BOUND)
	105	ResourceError("modulus too big");
	106
	107
109	108
110	109	FFTInfo->rem_struct.init(n, ZZ_pInfo->p, GetFFTPrime);
111
112	110	FFTInfo->crt_struct.init(n, ZZ_pInfo->p, GetFFTPrime);
113	111
114	112	if (!FFTInfo->crt_struct.special()) {
	113	FFTInfo->prime.SetLength(n);
	114	FFTInfo->prime_recip.SetLength(n);
	115	FFTInfo->u.SetLength(n);
	116	FFTInfo->uqinv.SetLength(n);
	117
	118	// montgomery
	119	FFTInfo->reduce_struct.init(ZZ_pInfo->p, ZZ(n) << NTL_SP_NBITS);
	120
115	121	ZZ qq, rr;
116	122
117	123	DivRem(qq, rr, M, ZZ_pInfo->p);
118	124
119	125	NegateMod(FFTInfo->MinusMModP, rr, ZZ_pInfo->p);
	126
	127	// montgomery
	128	FFTInfo->reduce_struct.adjust(FFTInfo->MinusMModP);
120	129
121	130	for (i = 0; i < n; i++) {
122	131	q = GetFFTPrime(i);

133	142	t = rem(M1, q);
134	143	t = InvMod(t, q);
135	144
136		mul(M3, M2, t);
137		rem(M3, M3, ZZ_pInfo->p);
138
139		FFTInfo->crt_struct.insert(i, M3);
140
141
142		FFTInfo->x[i] = ((double) t)/((double) q);
	145	// montgomery
	146	FFTInfo->reduce_struct.adjust(M2);
	147
	148	FFTInfo->crt_struct.insert(i, M2);
	149
	150	FFTInfo->prime[i] = q;
	151	FFTInfo->prime_recip[i] = 1/double(q);
143	152	FFTInfo->u[i] = t;
144	153	FFTInfo->uqinv[i] = PrepMulModPrecon(FFTInfo->u[i], q, qinv);
145	154	}
	155
146	156	}
147	157
148	158	tmps = MakeSmart<ZZ_pTmpSpaceT>();

159	169	tmps->rem_tmp_vec.fetch(FFTInfo->rem_struct);
160	170	}
161	171
162		ZZ_pTmpSpace = tmps;
	172	NTL_TLS_GLOBAL_ACCESS(ZZ_pTmpSpace_stg);
	173	ZZ_pTmpSpace_stg = tmps;
	174	ZZ_pTmpSpace = ZZ_pTmpSpace_stg.get();
163	175	}
164	176
165	177

172	184	}
173	185
174	186
	187	void ZZ_pContext::save()
	188	{
	189	NTL_TLS_GLOBAL_ACCESS(ZZ_pInfo_stg);
	190	ptr = ZZ_pInfo_stg;
	191	}
	192
175	193
176	194	void ZZ_pContext::restore() const
177	195	{
178		ZZ_pInfo = ptr;
	196	if (ZZ_pInfo == ptr.get()) return;
	197	// NOTE: this simple optimization could be useful in some situations,
	198	// for example, a worker thread re-setting the current modulus
	199	// in a multi-threaded build
	200
	201	NTL_TLS_GLOBAL_ACCESS(ZZ_pInfo_stg);
	202	ZZ_pInfo_stg = ptr;
	203	ZZ_pInfo = ZZ_pInfo_stg.get();
	204
	205	NTL_TLS_GLOBAL_ACCESS(ZZ_pTmpSpace_stg);
	206	ZZ_pTmpSpace_stg = 0;
179	207	ZZ_pTmpSpace = 0;
	208
180	209	ZZ_pInstalled = false;
181	210	}
182	211

203	232
204	233	const ZZ_p& ZZ_p::zero()
205	234	{
206		NTL_THREAD_LOCAL static ZZ_p z(INIT_NO_ALLOC);
	235	static const ZZ_p z(INIT_NO_ALLOC); // GLOBAL (assumes C++11 thread-safe init)
207	236	return z;
208	237	}
209	238
210		NTL_THREAD_LOCAL
	239	NTL_CHEAP_THREAD_LOCAL
211	240	ZZ_p::DivHandlerPtr ZZ_p::DivHandler = 0;
212	241
213	242

+13

-8

src/ZZ_pE.c less more

4	4	#include <NTL/new.h>
5	5
6	6	NTL_START_IMPL
	7
	8
	9	NTL_TLS_GLOBAL_DECL(SmartPtr<ZZ_pEInfoT>, ZZ_pEInfo_stg)
	10
	11	NTL_CHEAP_THREAD_LOCAL
	12	ZZ_pEInfoT *ZZ_pEInfo = 0;
	13
7	14
8	15	ZZ_pEInfoT::ZZ_pEInfoT(const ZZ_pX& NewP)
9	16	{

32	39
33	40
34	41
35
36		NTL_THREAD_LOCAL
37		SmartPtr<ZZ_pEInfoT> ZZ_pEInfo = 0;
38
39
40	42	void ZZ_pE::init(const ZZ_pX& p)
41	43	{
42	44	ZZ_pEContext c(p);

46	48
47	49	void ZZ_pEContext::save()
48	50	{
49		ptr = ZZ_pEInfo;
	51	NTL_TLS_GLOBAL_ACCESS(ZZ_pEInfo_stg);
	52	ptr = ZZ_pEInfo_stg;
50	53	}
51	54
52	55	void ZZ_pEContext::restore() const
53	56	{
54		ZZ_pEInfo = ptr;
	57	NTL_TLS_GLOBAL_ACCESS(ZZ_pEInfo_stg);
	58	ZZ_pEInfo_stg = ptr;
	59	ZZ_pEInfo = ZZ_pEInfo_stg.get();
55	60	}
56	61
57	62

76	81
77	82	const ZZ_pE& ZZ_pE::zero()
78	83	{
79		static ZZ_pE z(INIT_NO_ALLOC);
	84	static const ZZ_pE z(INIT_NO_ALLOC); // GLOBAL (assumes C++11 thread-safe init)
80	85	return z;
81	86	}
82	87

-2

src/ZZ_pEX.c less more

11	11
12	12	const ZZ_pEX& ZZ_pEX::zero()
13	13	{
14		NTL_THREAD_LOCAL static ZZ_pEX z;
	14	static const ZZ_pEX z; // GLOBAL (assumes C++11 thread-safe init)
15	15	return z;
16	16	}
17	17

2208	2208	MulMod(A.H[i], A.H[i-1], h, F);
2209	2209	}
2210	2210
2211		NTL_THREAD_LOCAL long ZZ_pEXArgBound = 0;
	2211	NTL_CHEAP_THREAD_LOCAL long ZZ_pEXArgBound = 0;
2212	2212
2213	2213
2214	2214

-6

src/ZZ_pEXFactoring.c less more

357	357	}
358	358
359	359
360		NTL_THREAD_LOCAL long ZZ_pEX_BlockingFactor = 10;
	360	NTL_CHEAP_THREAD_LOCAL long ZZ_pEX_BlockingFactor = 10;
361	361
362	362
363	363

1061	1061
1062	1062	/*********** NEW DDF **************/
1063	1063
1064		NTL_THREAD_LOCAL long ZZ_pEX_GCDTableSize = 4;
1065		NTL_THREAD_LOCAL double ZZ_pEXFileThresh = NTL_FILE_THRESH;
1066		NTL_THREAD_LOCAL static vec_ZZ_pEX *BabyStepFile=0;
1067		NTL_THREAD_LOCAL static vec_ZZ_pEX *GiantStepFile=0;
1068		NTL_THREAD_LOCAL static long use_files;
	1064	NTL_CHEAP_THREAD_LOCAL long ZZ_pEX_GCDTableSize = 4;
	1065	NTL_CHEAP_THREAD_LOCAL double ZZ_pEXFileThresh = NTL_FILE_THRESH;
	1066	static NTL_CHEAP_THREAD_LOCAL vec_ZZ_pEX *BabyStepFile=0;
	1067	static NTL_CHEAP_THREAD_LOCAL vec_ZZ_pEX *GiantStepFile=0;
	1068	static NTL_CHEAP_THREAD_LOCAL long use_files;
1069	1069
1070	1070
1071	1071	static

+1205

-205

src/ZZ_pX.c less more

0	0	#include <NTL/ZZ_pX.h>
	1	#include <NTL/BasicThreadPool.h>
	2	#include <NTL/new.h>
1	3
2	4
3	5	// The mul & sqr routines use routines from ZZX,

11	13
12	14	#endif
13	15
14		#include <NTL/new.h>
15	16
16	17
17	18	#if (defined(NTL_GMP_LIP))

29	30
30	31	const ZZ_pX& ZZ_pX::zero()
31	32	{
32		NTL_THREAD_LOCAL static ZZ_pX z;
	33	static const ZZ_pX z; // GLOBAL (relies on C++11 thread-safe init)
33	34	return z;
34	35	}
35	36

434	435
435	436	#ifndef NTL_WIZARD_HACK
436	437
437		// These crossovers are tuned for a Pentium, but hopefully
438		// they should be OK on other machines as well.
439
440
441		const long SS_kbound = 40;
442		const double SS_rbound = 1.25;
443
444	438
445	439	void mul(ZZ_pX& c, const ZZ_pX& a, const ZZ_pX& b)
446	440	{

472	466	else {
473	467	long mbits;
474	468	mbits = NumBits(ZZ_p::modulus());
475		if (k >= SS_kbound &&
476		SSRatio(deg(a), mbits, deg(b), mbits) < SS_rbound) {
	469
	470	long nt = 1;
	471	// FIXME: needs to be updated when I thread-enable the SS
	472	// mul routine
	473
	474	#ifdef NTL_THREAD_BOOST
	475	BasicThreadPool *pool = GetThreadPool();
	476	if (pool && !pool->active()) nt = pool->NumThreads();
	477	#endif
	478
	479	double rat = SSRatio(deg(a), mbits, deg(b), mbits);
	480
	481	if ( nt == 1 && (
	482
	483	(k >= 53 && rat < 1.10) \|\|
	484	(k >= 106 && rat < 1.30) \|\|
	485	(k >= 212 && rat < 1.75)
	486
	487	)) {
477	488	ZZX A, B, C;
478	489	conv(A, a);
479	490	conv(B, b);

512	523	else {
513	524	long mbits;
514	525	mbits = NumBits(ZZ_p::modulus());
515		if (k >= SS_kbound &&
516		SSRatio(deg(a), mbits, deg(a), mbits) < SS_rbound) {
	526
	527
	528	long nt = 1;
	529	// FIXME: needs to be updated when I thread-enable the SS
	530	// mul routine
	531
	532	#ifdef NTL_THREAD_BOOST
	533	BasicThreadPool *pool = GetThreadPool();
	534	if (pool && !pool->active()) nt = pool->NumThreads();
	535	#endif
	536
	537	double rat = SSRatio(deg(a), mbits, deg(a), mbits);
	538
	539	if ( nt == 1 && (
	540
	541	(k >= 53 && rat < 1.10) \|\|
	542	(k >= 106 && rat < 1.30) \|\|
	543	(k >= 212 && rat < 1.75)
	544
	545	)) {
517	546	ZZX A, C;
518	547	conv(A, a);
519	548	SSSqr(C, A);

983	1012	r.normalize();
984	1013	}
985	1014
	1015
	1016
	1017	NTL_TBDECL_static(MulAux)(ZZ_p* xp, const ZZ_p* ap, const ZZ_p& t, long n)
	1018	{
	1019	for (long i = 0; i < n; i++)
	1020	mul(xp[i], ap[i], t);
	1021	}
	1022
	1023	#ifdef NTL_THREAD_BOOST
	1024	static void MulAux(ZZ_p* xp, const ZZ_p* ap, const ZZ_p& t, long n)
	1025	{
	1026	BasicThreadPool *pool = GetThreadPool();
	1027
	1028	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	1029	basic_MulAux(xp, ap, t, n);
	1030	return;
	1031	}
	1032
	1033	ZZ_pContext local_context;
	1034	local_context.save();
	1035
	1036	pool->exec_range(n,
	1037	[xp, ap, &t, &local_context](long first, long last) {
	1038	local_context.restore();
	1039	for (long i = first; i < last; i++)
	1040	mul(xp[i], ap[i], t);
	1041	} );
	1042	}
	1043	#endif
	1044
	1045
	1046
986	1047	void mul(ZZ_pX& x, const ZZ_pX& a, const ZZ_p& b)
987	1048	{
988	1049	if (IsZero(b)) {

997	1058
998	1059	NTL_ZZ_pRegister(t);
999	1060
1000		long i, da;
	1061	long da;
1001	1062
1002	1063	const ZZ_p *ap;
1003	1064	ZZ_p* xp;
1004
1005	1065
1006	1066	t = b;
1007	1067

1010	1070	ap = a.rep.elts();
1011	1071	xp = x.rep.elts();
1012	1072
1013		for (i = 0; i <= da; i++)
1014		mul(xp[i], ap[i], t);
	1073	MulAux(xp, ap, t, da+1);
1015	1074
1016	1075	x.normalize();
1017	1076	}

1167	1226	}
1168	1227
1169	1228
	1229	NTL_TBDECL_static(MulByXModAux1)(long n, ZZ_p hh, const ZZ_p aa, const ZZ_p *ff, const ZZ_p& z)
	1230	{
	1231	NTL_ZZ_pRegister(t);
	1232
	1233	for (long i = n-1; i >= 1; i--) {
	1234	// hh[i] = aa[i-1] + z*ff[i]
	1235	mul(t, z, ff[i]);
	1236	add(hh[i], aa[i-1], t);
	1237	}
	1238	}
	1239
	1240	#ifdef NTL_THREAD_BOOST
	1241
	1242	static void MulByXModAux1(long n, ZZ_p hh, const ZZ_p aa, const ZZ_p *ff, const ZZ_p& z)
	1243	{
	1244
	1245	BasicThreadPool *pool = GetThreadPool();
	1246
	1247	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1 \|\| hh == aa) {
	1248	// Careful! can't parallelize if hh == aa
	1249	basic_MulByXModAux1(n, hh, aa, ff, z);
	1250	return;
	1251	}
	1252
	1253	ZZ_pContext local_context;
	1254	local_context.save();
	1255
	1256	pool->exec_range(n-1,
	1257	[n, hh, aa, ff, &z, &local_context]
	1258	(long first, long last) {
	1259	local_context.restore();
	1260	NTL_ZZ_pRegister(t);
	1261
	1262	for (long idx = first; idx < last; idx++) {
	1263	long i = n-1-idx;
	1264	// hh[i] = aa[i-1] + z*ff[i]
	1265	mul(t, z, ff[i]);
	1266	add(hh[i], aa[i-1], t);
	1267	}
	1268	} );
	1269	}
	1270
	1271
	1272	#endif
1170	1273
1171	1274
1172	1275	static

1176	1279	ZZ_p* hh;
1177	1280	const ZZ_p aa, ff;
1178	1281
1179		ZZ_p t, z;
	1282	NTL_ZZ_pRegister(z);
1180	1283
1181	1284	n = deg(f);
1182	1285	m = deg(a);

1204	1307	negate(z, aa[n-1]);
1205	1308	if (!IsOne(ff[n]))
1206	1309	div(z, z, ff[n]);
1207		for (i = n-1; i >= 1; i--) {
1208		mul(t, z, ff[i]);
1209		add(hh[i], aa[i-1], t);
1210		}
	1310
	1311	MulByXModAux1(n, hh, aa, ff, z);
	1312
1211	1313	mul(hh[0], z, ff[0]);
1212	1314	h.normalize();
1213	1315	}

1323	1425
1324	1426
1325	1427
1326		NTL_THREAD_LOCAL static vec_long ModularRepBuf;
1327	1428	// FIXME: maybe I could put this is scratch space associated
1328	1429	// with the current modulus
	1430	static inline
	1431	vec_long& ModularRepBuf()
	1432	{
	1433	NTL_TLS_LOCAL(vec_long, t);
	1434	return t;
	1435	}
1329	1436
1330	1437
1331	1438	void ToModularRep(vec_long& x, const ZZ_p& a, const ZZ_pFFTInfoT *FFTInfo,

1335	1442	}
1336	1443
1337	1444
1338		// NOTE: earlier versions used Kahan summation...
1339		// we no longer do this, as it is less portable than I thought.
1340
1341		void FromModularRep(ZZ_p& x, const vec_long& a, const ZZ_pFFTInfoT *FFTInfo,
	1445	void FromModularRep(ZZ_p& x, vec_long& avec, const ZZ_pFFTInfoT *FFTInfo,
1342	1446	ZZ_pTmpSpaceT *TmpSpace)
1343		{
1344		long n = FFTInfo->NumPrimes;
1345		NTL_ZZRegister(q);
1346		NTL_ZZRegister(s);
	1447	// NOTE: a gets destroyed
	1448
	1449	{
1347	1450	NTL_ZZRegister(t);
1348		long i;
1349		double y;
	1451	long * NTL_RESTRICT a = avec.elts();
1350	1452
1351	1453	if (FFTInfo->crt_struct.special()) {
1352		FFTInfo->crt_struct.eval(t, &a[0], TmpSpace->crt_tmp_vec);
	1454	FFTInfo->crt_struct.eval(t, a, TmpSpace->crt_tmp_vec);
1353	1455	x.LoopHole() = t;
1354	1456	return;
1355	1457	}
	1458
	1459	long nprimes = FFTInfo->NumPrimes;
	1460	const long *u = FFTInfo->u.elts();
	1461	const long *prime = FFTInfo->prime.elts();
	1462	const mulmod_precon_t *uqinv = FFTInfo->uqinv.elts();
	1463	const double *prime_recip = FFTInfo->prime_recip.elts();
1356	1464
1357
1358		if (FFTInfo->QuickCRT) {
1359		y = double(0L);
1360		for (i = 0; i < n; i++)
1361		y += ((double) a[i])*FFTInfo->x[i];
1362
1363		conv(q, (y + 0.5));
1364		}
	1465	double y = 0.0;
	1466
	1467	for (long i = 0; i < nprimes; i++) {
	1468	long r = MulModPrecon(a[i], u[i], prime[i], uqinv[i]);
	1469	a[i] = r;
	1470	y += double(r)*prime_recip[i];
	1471	}
	1472
	1473	long q = long(y + 0.5);
	1474
	1475	FFTInfo->crt_struct.eval(t, a, TmpSpace->crt_tmp_vec);
	1476
	1477	MulAddTo(t, FFTInfo->MinusMModP, q);
	1478	// TODO: this MulAddTo could be folded into the above
	1479	// crt_struct.eval as just another product to accumulate...
	1480	// but, savings would be marginal and a number of interfaces
	1481	// would have to be modified...
	1482
	1483	// montgomery
	1484	FFTInfo->reduce_struct.eval(x.LoopHole(), t);
	1485	}
	1486
	1487
	1488
	1489
	1490
	1491	NTL_TBDECL(ToFFTRep)(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi)
	1492	// computes an n = 2^k point convolution.
	1493	// if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
	1494	{
	1495	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	1496	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
	1497
	1498
	1499	long n, i, j, m, j1;
	1500	vec_long& t = ModularRepBuf();
	1501
	1502
	1503	if (k > FFTInfo->MaxRoot)
	1504	ResourceError("Polynomial too big for FFT");
	1505
	1506	if (lo < 0)
	1507	LogicError("bad arg to ToFFTRep");
	1508
	1509	long nprimes = FFTInfo->NumPrimes;
	1510	t.SetLength(nprimes);
	1511
	1512	hi = min(hi, deg(x));
	1513
	1514	y.SetSize(k);
	1515
	1516	n = 1L << k;
	1517
	1518	m = max(hi-lo + 1, 0);
	1519
	1520	const ZZ_p *xx = x.rep.elts();
	1521
	1522	if (n >= m) {
	1523	for (j = 0; j < m; j++) {
	1524	ToModularRep(t, xx[j+lo], FFTInfo, TmpSpace);
	1525	for (i = 0; i < nprimes; i++) {
	1526	y.tbl[i][j] = t[i];
	1527	}
	1528	}
	1529
	1530	if (n > m) {
	1531	for (i = 0; i < nprimes; i++) {
	1532	long *yp = &y.tbl[i][0];
	1533	for (j = m; j < n; j++) {
	1534	yp[j] = 0;
	1535	}
	1536	}
	1537	}
	1538	}
1365	1539	else {
1366		long Q, r;
1367		long qq;
1368
1369		y = double(0L);
1370
1371		clear(q);
1372
1373		for (i = 0; i < n; i++) {
1374		r = MulModPreconWithQuo(Q, a[i], FFTInfo->u[i], GetFFTPrime(i), FFTInfo->uqinv[i]);
1375		// FIXME: add to documented interface?
1376
1377		add(q, q, Q);
1378		y += double(r)*GetFFTPrimeRecip(i);
1379		}
1380
1381		qq = long(y + 0.5);
1382		add(q, q, qq);
1383		}
1384
1385		FFTInfo->crt_struct.eval(t, &a[0], TmpSpace->crt_tmp_vec);
1386
1387		mul(s, q, FFTInfo->MinusMModP);
1388		add(t, t, s);
1389
1390		conv(x, t);
1391		}
1392
1393
1394
	1540	NTL_ZZ_pRegister(accum);
	1541	for (j = 0; j < n; j++) {
	1542	accum = xx[j+lo];
	1543	for (j1 = j + n; j1 < m; j1 += n)
	1544	add(accum, accum, xx[j1+lo]);
	1545	ToModularRep(t, accum, FFTInfo, TmpSpace);
	1546	for (i = 0; i < nprimes; i++) {
	1547	y.tbl[i][j] = t[i];
	1548	}
	1549	}
	1550	}
	1551
	1552	// FIXME: something to think about...part of the above logic
	1553	// is essentially a matrix transpose, which could lead to bad
	1554	// cache performance. I don't really know if that is an issue.
	1555
	1556	for (i = 0; i < nprimes; i++) {
	1557	long *yp = &y.tbl[i][0];
	1558	FFTFwd(yp, yp, k, i);
	1559	}
	1560	}
	1561
	1562
	1563	#ifdef NTL_THREAD_BOOST
1395	1564
1396	1565	void ToFFTRep(FFTRep& y, const ZZ_pX& x, long k, long lo, long hi)
1397	1566	// computes an n = 2^k point convolution.
1398	1567	// if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
1399	1568	{
	1569	BasicThreadPool *pool = GetThreadPool();
	1570
	1571	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	1572	basic_ToFFTRep(y, x, k, lo, hi);
	1573	return;
	1574	}
	1575
	1576
	1577
	1578	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	1579
	1580	long n, m;
	1581
	1582
	1583	if (k > FFTInfo->MaxRoot)
	1584	ResourceError("Polynomial too big for FFT");
	1585
	1586	if (lo < 0)
	1587	LogicError("bad arg to ToFFTRep");
	1588
	1589	long nprimes = FFTInfo->NumPrimes;
	1590
	1591	hi = min(hi, deg(x));
	1592
	1593	y.SetSize(k);
	1594
	1595	n = 1L << k;
	1596
	1597	m = max(hi-lo + 1, 0);
	1598
	1599	const ZZ_p *xx = x.rep.elts();
	1600
	1601
	1602	ZZ_pContext local_context;
	1603	local_context.save();
	1604
	1605	if (n >= m) {
	1606	pool->exec_range(m,
	1607	[lo, xx, &y, nprimes, &local_context, FFTInfo]
	1608	(long first, long last) {
	1609
	1610	local_context.restore();
	1611	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
	1612	// TmpSpace is thread local!
	1613
	1614	vec_long& t = ModularRepBuf();
	1615	t.SetLength(nprimes);
	1616
	1617	for (long j = first; j < last; j++) {
	1618	ToModularRep(t, xx[j+lo], FFTInfo, TmpSpace);
	1619	for (long i = 0; i < nprimes; i++) {
	1620	y.tbl[i][j] = t[i];
	1621	}
	1622	}
	1623	} );
	1624	}
	1625	else {
	1626	pool->exec_range(n,
	1627	[lo, m, n, xx, &y, nprimes, &local_context, FFTInfo]
	1628	(long first, long last) {
	1629	local_context.restore();
	1630	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
	1631	// TmpSpace is thread local!
	1632
	1633	vec_long& t = ModularRepBuf();
	1634	t.SetLength(nprimes);
	1635
	1636	NTL_ZZ_pRegister(accum);
	1637	for (long j = first; j < last; j++) {
	1638	accum = xx[j+lo];
	1639	for (long j1 = j + n; j1 < m; j1 += n)
	1640	add(accum, accum, xx[j1+lo]);
	1641	ToModularRep(t, accum, FFTInfo, TmpSpace);
	1642	for (long i = 0; i < nprimes; i++) {
	1643	y.tbl[i][j] = t[i];
	1644	}
	1645	}
	1646	} );
	1647	}
	1648
	1649	// FIXME: something to think about...part of the above logic
	1650	// is essentially a matrix transpose, which could lead to bad
	1651	// cache performance. I don't really know if that is an issue.
	1652
	1653	pool->exec_range(nprimes,
	1654	[&y, m, n, k](long first, long last) {
	1655	for (long i = first; i < last; i++) {
	1656	long *yp = &y.tbl[i][0];
	1657	for (long j = m; j < n; j++) yp[j] = 0;
	1658	FFTFwd(yp, yp, k, i);
	1659	}
	1660	} );
	1661	}
	1662
	1663	#endif
	1664
	1665
	1666
	1667	NTL_TBDECL(RevToFFTRep)(FFTRep& y, const vec_ZZ_p& x,
	1668	long k, long lo, long hi, long offset)
	1669	// computes an n = 2^k point convolution of X^offset*x[lo..hi] mod X^n-1
	1670	// using "inverted" evaluation points.
	1671
	1672	{
1400	1673	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1401	1674	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1402
	1675
1403	1676
1404	1677	long n, i, j, m, j1;
1405		vec_long& t = ModularRepBuf;
1406		ZZ_p accum;
1407
	1678	vec_long& t = ModularRepBuf();
	1679	NTL_ZZ_pRegister(accum);
1408	1680
1409	1681	if (k > FFTInfo->MaxRoot)
1410	1682	ResourceError("Polynomial too big for FFT");

1412	1684	if (lo < 0)
1413	1685	LogicError("bad arg to ToFFTRep");
1414	1686
1415		t.SetLength(FFTInfo->NumPrimes);
1416
1417		hi = min(hi, deg(x));
	1687	long nprimes = FFTInfo->NumPrimes;
	1688	t.SetLength(nprimes);
	1689
	1690	hi = min(hi, x.length()-1);
1418	1691
1419	1692	y.SetSize(k);
1420	1693

1422	1695
1423	1696	m = max(hi-lo + 1, 0);
1424	1697
1425		const ZZ_p *xx = x.rep.elts();
	1698	const ZZ_p *xx = x.elts();
	1699
	1700	offset = offset & (n-1);
1426	1701
1427	1702	for (j = 0; j < n; j++) {
1428	1703	if (j >= m) {
1429		for (i = 0; i < FFTInfo->NumPrimes; i++)
1430		y.tbl[i][j] = 0;
	1704	for (i = 0; i < nprimes; i++)
	1705	y.tbl[i][offset] = 0;
1431	1706	}
1432	1707	else {
1433	1708	accum = xx[j+lo];
1434	1709	for (j1 = j + n; j1 < m; j1 += n)
1435	1710	add(accum, accum, xx[j1+lo]);
1436	1711	ToModularRep(t, accum, FFTInfo, TmpSpace);
1437		for (i = 0; i < FFTInfo->NumPrimes; i++) {
1438		y.tbl[i][j] = t[i];
	1712	for (i = 0; i < nprimes; i++) {
	1713	y.tbl[i][offset] = t[i];
	1714
1439	1715	}
1440	1716	}
1441		}
1442
1443		// FIXME: something to think about...part of the above logic
1444		// is essentially a matrix transpose, which could lead to bad
1445		// cache performance. I don't really know if that is an issue.
1446
1447		for (i = 0; i < FFTInfo->NumPrimes; i++) {
	1717
	1718	offset = (offset + 1) & (n-1);
	1719	}
	1720
	1721
	1722	for (i = 0; i < nprimes; i++) {
1448	1723	long *yp = &y.tbl[i][0];
1449		FFTFwd(yp, yp, k, i);
1450		}
1451		}
1452
1453
	1724	FFTRev1(yp, yp, k, i);
	1725	}
	1726
	1727	}
	1728
	1729
	1730
	1731	#ifdef NTL_THREAD_BOOST
1454	1732
1455	1733	void RevToFFTRep(FFTRep& y, const vec_ZZ_p& x,
1456	1734	long k, long lo, long hi, long offset)

1458	1736	// using "inverted" evaluation points.
1459	1737
1460	1738	{
	1739	BasicThreadPool *pool = GetThreadPool();
	1740
	1741	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	1742	basic_RevToFFTRep(y, x, k, lo, hi, offset);
	1743	return;
	1744	}
	1745
1461	1746	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1462		ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1463
1464
1465		long n, i, j, m, j1;
1466		vec_long& t = ModularRepBuf;
1467		ZZ_p accum;
	1747
	1748	long n, m;
1468	1749
1469	1750	if (k > FFTInfo->MaxRoot)
1470	1751	ResourceError("Polynomial too big for FFT");

1472	1753	if (lo < 0)
1473	1754	LogicError("bad arg to ToFFTRep");
1474	1755
1475		t.SetLength(FFTInfo->NumPrimes);
	1756	long nprimes = FFTInfo->NumPrimes;
1476	1757
1477	1758	hi = min(hi, x.length()-1);
1478	1759

1486	1767
1487	1768	offset = offset & (n-1);
1488	1769
1489		for (j = 0; j < n; j++) {
1490		if (j >= m) {
1491		for (i = 0; i < FFTInfo->NumPrimes; i++)
1492		y.tbl[i][offset] = 0;
1493		}
1494		else {
1495		accum = xx[j+lo];
1496		for (j1 = j + n; j1 < m; j1 += n)
1497		add(accum, accum, xx[j1+lo]);
1498		ToModularRep(t, accum, FFTInfo, TmpSpace);
1499		for (i = 0; i < FFTInfo->NumPrimes; i++) {
1500		y.tbl[i][offset] = t[i];
1501
	1770	ZZ_pContext local_context;
	1771	local_context.save();
	1772
	1773	pool->exec_range(n,
	1774	[lo, m, n, offset, xx, &y, nprimes, &local_context, FFTInfo]
	1775	(long first, long last) {
	1776
	1777	local_context.restore();
	1778	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
	1779	// TmpSpace is thread local!
	1780
	1781	vec_long& t = ModularRepBuf();
	1782	t.SetLength(nprimes);
	1783
	1784	long local_offset = (offset + first) & (n-1);
	1785
	1786	NTL_ZZ_pRegister(accum);
	1787
	1788	for (long j = first; j < last; j++) {
	1789	if (j >= m) {
	1790	for (long i = 0; i < nprimes; i++)
	1791	y.tbl[i][local_offset] = 0;
1502	1792	}
1503		}
1504
1505		offset = (offset + 1) & (n-1);
1506		}
1507
1508
1509		for (i = 0; i < FFTInfo->NumPrimes; i++) {
1510		long *yp = &y.tbl[i][0];
1511		FFTRev1(yp, yp, k, i);
1512		}
1513
1514		}
1515
1516		void FromFFTRep(ZZ_pX& x, FFTRep& y, long lo, long hi)
	1793	else {
	1794	accum = xx[j+lo];
	1795	for (long j1 = j + n; j1 < m; j1 += n)
	1796	add(accum, accum, xx[j1+lo]);
	1797	ToModularRep(t, accum, FFTInfo, TmpSpace);
	1798	for (long i = 0; i < nprimes; i++) {
	1799	y.tbl[i][local_offset] = t[i];
	1800
	1801	}
	1802	}
	1803
	1804	local_offset = (local_offset + 1) & (n-1);
	1805	}
	1806	} );
	1807
	1808	pool->exec_range(nprimes,
	1809	[&y, k](long first, long last) {
	1810	for (long i = first; i < last; i++) {
	1811	long *yp = &y.tbl[i][0];
	1812	FFTRev1(yp, yp, k, i);
	1813	}
	1814	} );
	1815
	1816	}
	1817
	1818
	1819	#endif
	1820
	1821
	1822
	1823
	1824
	1825
	1826	NTL_TBDECL(FromFFTRep)(ZZ_pX& x, FFTRep& y, long lo, long hi)
1517	1827
1518	1828	// converts from FFT-representation to coefficient representation
1519	1829	// only the coefficients lo..hi are computed

1525	1835
1526	1836	long k, n, i, j, l;
1527	1837
1528		vec_long& t = ModularRepBuf;
1529
1530		t.SetLength(FFTInfo->NumPrimes);
	1838	vec_long& t = ModularRepBuf();
	1839
	1840	long nprimes = FFTInfo->NumPrimes;
	1841	t.SetLength(nprimes);
1531	1842
1532	1843	k = y.k;
1533	1844	n = (1L << k);
1534	1845
1535	1846
1536		for (i = 0; i < FFTInfo->NumPrimes; i++) {
	1847	for (i = 0; i < nprimes; i++) {
1537	1848	long *yp = &y.tbl[i][0];
1538	1849	FFTRev1(yp, yp, k, i);
1539	1850	}

1544	1855	x.rep.SetLength(l);
1545	1856
1546	1857	for (j = 0; j < l; j++) {
1547		for (i = 0; i < FFTInfo->NumPrimes; i++)
	1858	for (i = 0; i < nprimes; i++)
1548	1859	t[i] = y.tbl[i][j+lo];
1549	1860
1550	1861	FromModularRep(x.rep[j], t, FFTInfo, TmpSpace);

1553	1864	x.normalize();
1554	1865	}
1555	1866
1556		void RevFromFFTRep(vec_ZZ_p& x, FFTRep& y, long lo, long hi)
	1867	#ifdef NTL_THREAD_BOOST
	1868
	1869	void FromFFTRep(ZZ_pX& x, FFTRep& y, long lo, long hi)
	1870
	1871	// converts from FFT-representation to coefficient representation
	1872	// only the coefficients lo..hi are computed
	1873
	1874
	1875	{
	1876	BasicThreadPool *pool = GetThreadPool();
	1877
	1878	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	1879	basic_FromFFTRep(x, y, lo, hi);
	1880	return;
	1881	}
	1882	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	1883
	1884	long k, n, l;
	1885
	1886	long nprimes = FFTInfo->NumPrimes;
	1887
	1888	k = y.k;
	1889	n = (1L << k);
	1890
	1891
	1892	pool->exec_range(nprimes,
	1893	[&y, k](long first, long last) {
	1894	for (long i = first; i < last; i++) {
	1895	long *yp = &y.tbl[i][0];
	1896	FFTRev1(yp, yp, k, i);
	1897	}
	1898	} );
	1899
	1900	hi = min(hi, n-1);
	1901	l = hi-lo+1;
	1902	l = max(l, 0);
	1903	x.rep.SetLength(l);
	1904	ZZ_p *xx = x.rep.elts();
	1905
	1906	ZZ_pContext local_context;
	1907	local_context.save();
	1908
	1909	pool->exec_range(l,
	1910	[lo, xx, &y, nprimes, &local_context, FFTInfo]
	1911	(long first, long last) {
	1912
	1913	local_context.restore();
	1914	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
	1915	// TmpSpace is thread local!
	1916
	1917	vec_long& t = ModularRepBuf();
	1918	t.SetLength(nprimes);
	1919
	1920	for (long j = first; j < last; j++) {
	1921	for (long i = 0; i < nprimes; i++)
	1922	t[i] = y.tbl[i][j+lo];
	1923
	1924	FromModularRep(xx[j], t, FFTInfo, TmpSpace);
	1925	}
	1926	} );
	1927
	1928	x.normalize();
	1929	}
	1930
	1931
	1932
	1933	#endif
	1934
	1935
	1936
	1937
	1938
	1939
	1940	NTL_TBDECL(RevFromFFTRep)(vec_ZZ_p& x, FFTRep& y, long lo, long hi)
1557	1941
1558	1942	// converts from FFT-representation to coefficient representation
1559	1943	// using "inverted" evaluation points.

1567	1951
1568	1952	long k, n, i, j, l;
1569	1953
1570		vec_long& t = ModularRepBuf;
	1954	vec_long& t = ModularRepBuf();
1571	1955
1572	1956	k = y.k;
1573	1957	n = (1L << k);
1574	1958
1575		t.SetLength(FFTInfo->NumPrimes);
1576
1577		for (i = 0; i < FFTInfo->NumPrimes; i++) {
	1959	long nprimes = FFTInfo->NumPrimes;
	1960	t.SetLength(nprimes);
	1961
	1962	for (i = 0; i < nprimes; i++) {
1578	1963	long *yp = &y.tbl[i][0];
1579	1964	FFTFwd(yp, yp, k, i);
1580	1965	}

1585	1970	x.SetLength(l);
1586	1971
1587	1972	for (j = 0; j < l; j++) {
1588		for (i = 0; i < FFTInfo->NumPrimes; i++)
	1973	for (i = 0; i < nprimes; i++)
1589	1974	t[i] = y.tbl[i][j+lo];
1590	1975
1591	1976	FromModularRep(x[j], t, FFTInfo, TmpSpace);
1592	1977	}
1593	1978	}
1594	1979
1595		void NDFromFFTRep(ZZ_pX& x, const FFTRep& y, long lo, long hi, FFTRep& z)
	1980
	1981	#ifdef NTL_THREAD_BOOST
	1982
	1983	void RevFromFFTRep(vec_ZZ_p& x, FFTRep& y, long lo, long hi)
	1984	{
	1985	BasicThreadPool *pool = GetThreadPool();
	1986
	1987	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	1988	basic_RevFromFFTRep(x, y, lo, hi);
	1989	return;
	1990	}
	1991	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	1992
	1993	long k, n, l;
	1994
	1995	long nprimes = FFTInfo->NumPrimes;
	1996
	1997	k = y.k;
	1998	n = (1L << k);
	1999
	2000
	2001	pool->exec_range(nprimes,
	2002	[&y, k](long first, long last) {
	2003	for (long i = first; i < last; i++) {
	2004	long *yp = &y.tbl[i][0];
	2005	FFTFwd(yp, yp, k, i);
	2006	}
	2007	} );
	2008
	2009	hi = min(hi, n-1);
	2010	l = hi-lo+1;
	2011	l = max(l, 0);
	2012	x.SetLength(l);
	2013	ZZ_p *xx = x.elts();
	2014
	2015	ZZ_pContext local_context;
	2016	local_context.save();
	2017
	2018	pool->exec_range(l,
	2019	[lo, xx, &y, nprimes, &local_context, FFTInfo]
	2020	(long first, long last) {
	2021
	2022	local_context.restore();
	2023	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
	2024	// TmpSpace is thread local!
	2025
	2026	vec_long& t = ModularRepBuf();
	2027	t.SetLength(nprimes);
	2028
	2029	for (long j = first; j < last; j++) {
	2030	for (long i = 0; i < nprimes; i++)
	2031	t[i] = y.tbl[i][j+lo];
	2032
	2033	FromModularRep(xx[j], t, FFTInfo, TmpSpace);
	2034	}
	2035	} );
	2036
	2037	}
	2038
	2039
	2040
	2041
	2042	#endif
	2043
	2044
	2045
	2046
	2047
	2048
	2049
	2050	NTL_TBDECL(NDFromFFTRep)(ZZ_pX& x, const FFTRep& y, long lo, long hi, FFTRep& z)
1596	2051	{
1597	2052	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1598	2053	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();

1600	2055
1601	2056	long k, n, i, j, l;
1602	2057
1603		vec_long& t = ModularRepBuf;
1604
1605		t.SetLength(FFTInfo->NumPrimes);
	2058	vec_long& t = ModularRepBuf();
	2059
	2060	long nprimes = FFTInfo->NumPrimes;
	2061	t.SetLength(nprimes);
1606	2062	k = y.k;
1607	2063	n = (1L << k);
1608	2064
1609	2065	z.SetSize(k);
1610	2066
1611		for (i = 0; i < FFTInfo->NumPrimes; i++) {
	2067	for (i = 0; i < nprimes; i++) {
1612	2068	long *zp = &z.tbl[i][0];
1613	2069	const long *yp = &y.tbl[i][0];
1614	2070

1621	2077	x.rep.SetLength(l);
1622	2078
1623	2079	for (j = 0; j < l; j++) {
1624		for (i = 0; i < FFTInfo->NumPrimes; i++)
	2080	for (i = 0; i < nprimes; i++)
1625	2081	t[i] = z.tbl[i][j+lo];
1626	2082
1627	2083	FromModularRep(x.rep[j], t, FFTInfo, TmpSpace);

1630	2086	x.normalize();
1631	2087	}
1632	2088
1633		void NDFromFFTRep(ZZ_pX& x, FFTRep& y, long lo, long hi)
1634		{
1635		FFTRep z;
1636		NDFromFFTRep(x, y, lo, hi, z);
1637		}
1638
1639		void FromFFTRep(ZZ_p* x, FFTRep& y, long lo, long hi)
	2089	#ifdef NTL_THREAD_BOOST
	2090
	2091	void NDFromFFTRep(ZZ_pX& x, const FFTRep& y, long lo, long hi, FFTRep& z)
1640	2092
1641	2093	// converts from FFT-representation to coefficient representation
1642	2094	// only the coefficients lo..hi are computed
1643	2095
1644	2096
1645	2097	{
	2098	BasicThreadPool *pool = GetThreadPool();
	2099
	2100	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	2101	basic_NDFromFFTRep(x, y, lo, hi, z);
	2102	return;
	2103	}
	2104	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2105
	2106	long k, n, l;
	2107
	2108	long nprimes = FFTInfo->NumPrimes;
	2109
	2110	k = y.k;
	2111	n = (1L << k);
	2112
	2113	z.SetSize(k);
	2114
	2115	pool->exec_range(nprimes,
	2116	[&y, &z, k](long first, long last) {
	2117	for (long i = first; i < last; i++) {
	2118	long *zp = &z.tbl[i][0];
	2119	const long *yp = &y.tbl[i][0];
	2120	FFTRev1(zp, yp, k, i);
	2121	}
	2122	} );
	2123
	2124	hi = min(hi, n-1);
	2125	l = hi-lo+1;
	2126	l = max(l, 0);
	2127	x.rep.SetLength(l);
	2128	ZZ_p *xx = x.rep.elts();
	2129
	2130	ZZ_pContext local_context;
	2131	local_context.save();
	2132
	2133	pool->exec_range(l,
	2134	[lo, xx, &z, nprimes, &local_context, FFTInfo]
	2135	(long first, long last) {
	2136
	2137	local_context.restore();
	2138	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
	2139	// TmpSpace is thread local!
	2140
	2141	vec_long& t = ModularRepBuf();
	2142	t.SetLength(nprimes);
	2143
	2144	for (long j = first; j < last; j++) {
	2145	for (long i = 0; i < nprimes; i++)
	2146	t[i] = z.tbl[i][j+lo];
	2147
	2148	FromModularRep(xx[j], t, FFTInfo, TmpSpace);
	2149	}
	2150	} );
	2151
	2152	x.normalize();
	2153	}
	2154
	2155
	2156
	2157	#endif
	2158
	2159	void NDFromFFTRep(ZZ_pX& x, FFTRep& y, long lo, long hi)
	2160	{
	2161	FFTRep z;
	2162	NDFromFFTRep(x, y, lo, hi, z);
	2163	}
	2164
	2165	NTL_TBDECL(FromFFTRep)(ZZ_p* x, FFTRep& y, long lo, long hi)
	2166
	2167	// converts from FFT-representation to coefficient representation
	2168	// only the coefficients lo..hi are computed
	2169
	2170
	2171	{
1646	2172	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1647	2173	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1648	2174
1649	2175
1650	2176	long k, n, i, j;
1651	2177
1652		vec_long& t = ModularRepBuf;
	2178	vec_long& t = ModularRepBuf();
1653	2179
1654	2180	k = y.k;
1655	2181	n = (1L << k);
1656	2182
1657		t.SetLength(FFTInfo->NumPrimes);
1658
1659		for (i = 0; i < FFTInfo->NumPrimes; i++) {
	2183	long nprimes = FFTInfo->NumPrimes;
	2184	t.SetLength(nprimes);
	2185
	2186	for (i = 0; i < nprimes; i++) {
1660	2187	long *yp = &y.tbl[i][0];
1661	2188	FFTRev1(yp, yp, k, i);
1662	2189	}

1665	2192	if (j >= n)
1666	2193	clear(x[j-lo]);
1667	2194	else {
1668		for (i = 0; i < FFTInfo->NumPrimes; i++)
	2195	for (i = 0; i < nprimes; i++)
1669	2196	t[i] = y.tbl[i][j];
1670	2197
1671	2198	FromModularRep(x[j-lo], t, FFTInfo, TmpSpace);

1674	2201	}
1675	2202
1676	2203
1677		void mul(FFTRep& z, const FFTRep& x, const FFTRep& y)
	2204	#ifdef NTL_THREAD_BOOST
	2205
	2206	void FromFFTRep(ZZ_p* x, FFTRep& y, long lo, long hi)
	2207
	2208	// converts from FFT-representation to coefficient representation
	2209	// only the coefficients lo..hi are computed
	2210
	2211
	2212	{
	2213	BasicThreadPool *pool = GetThreadPool();
	2214
	2215	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	2216	basic_FromFFTRep(x, y, lo, hi);
	2217	return;
	2218	}
	2219
	2220	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2221
	2222
	2223	long k, n, l;
	2224
	2225	k = y.k;
	2226	n = (1L << k);
	2227
	2228	long nprimes = FFTInfo->NumPrimes;
	2229
	2230
	2231	pool->exec_range(nprimes,
	2232	[&y, k](long first, long last) {
	2233	for (long i = first; i < last; i++) {
	2234	long *yp = &y.tbl[i][0];
	2235	FFTRev1(yp, yp, k, i);
	2236	}
	2237	} );
	2238
	2239
	2240	ZZ_pContext local_context;
	2241	local_context.save();
	2242
	2243	pool->exec_range(hi-lo+1,
	2244	[n, lo, x, &y, nprimes, &local_context, FFTInfo]
	2245	(long first, long last) {
	2246
	2247	local_context.restore();
	2248	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
	2249	// TmpSpace is thread local!
	2250
	2251	vec_long& t = ModularRepBuf();
	2252	t.SetLength(nprimes);
	2253
	2254	for (long idx = first; idx < last; idx++) {
	2255	long j = lo + idx;
	2256
	2257	if (j >= n)
	2258	clear(x[j-lo]);
	2259	else {
	2260	for (long i = 0; i < nprimes; i++)
	2261	t[i] = y.tbl[i][j];
	2262
	2263	FromModularRep(x[j-lo], t, FFTInfo, TmpSpace);
	2264	}
	2265	}
	2266	} );
	2267	}
	2268
	2269	#endif
	2270
	2271
	2272	NTL_TBDECL(mul)(FFTRep& z, const FFTRep& x, const FFTRep& y)
1678	2273	{
1679	2274	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1680	2275

1687	2282
1688	2283	z.SetSize(k);
1689	2284
1690		for (i = 0; i < FFTInfo->NumPrimes; i++) {
	2285	long nprimes = FFTInfo->NumPrimes;
	2286
	2287	for (i = 0; i < nprimes; i++) {
1691	2288	long *zp = &z.tbl[i][0];
1692	2289	const long *xp = &x.tbl[i][0];
1693	2290	const long *yp = &y.tbl[i][0];

1700	2297
1701	2298	}
1702	2299
1703		void sub(FFTRep& z, const FFTRep& x, const FFTRep& y)
1704		{
	2300
	2301	#ifdef NTL_THREAD_BOOST
	2302
	2303	void mul(FFTRep& z, const FFTRep& x, const FFTRep& y)
	2304	{
	2305	BasicThreadPool *pool = GetThreadPool();
	2306
	2307	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	2308	basic_mul(z, x, y);
	2309	return;
	2310	}
1705	2311	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1706	2312
1707		long k, n, i, j;
	2313	long k, n;
1708	2314
1709	2315	if (x.k != y.k) LogicError("FFT rep mismatch");
1710	2316

1713	2319
1714	2320	z.SetSize(k);
1715	2321
1716		for (i = 0; i < FFTInfo->NumPrimes; i++) {
	2322	long nprimes = FFTInfo->NumPrimes;
	2323
	2324	pool->exec_range(nprimes,
	2325	[&x, &y, &z, n](long first, long last) {
	2326	for (long i = first; i < last; i++) {
	2327	long *zp = &z.tbl[i][0];
	2328	const long *xp = &x.tbl[i][0];
	2329	const long *yp = &y.tbl[i][0];
	2330	long q = GetFFTPrime(i);
	2331	mulmod_t qinv = GetFFTPrimeInv(i);
	2332
	2333	for (long j = 0; j < n; j++)
	2334	zp[j] = NormalizedMulMod(xp[j], yp[j], q, qinv);
	2335	}
	2336	} );
	2337
	2338	}
	2339
	2340	#endif
	2341
	2342
	2343
	2344	NTL_TBDECL(sub)(FFTRep& z, const FFTRep& x, const FFTRep& y)
	2345	{
	2346	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2347
	2348	long k, n, i, j;
	2349
	2350	if (x.k != y.k) LogicError("FFT rep mismatch");
	2351
	2352	k = x.k;
	2353	n = 1L << k;
	2354
	2355	z.SetSize(k);
	2356
	2357	long nprimes = FFTInfo->NumPrimes;
	2358
	2359	for (i = 0; i < nprimes; i++) {
1717	2360	long *zp = &z.tbl[i][0];
1718	2361	const long *xp = &x.tbl[i][0];
1719	2362	const long *yp = &y.tbl[i][0];

1722	2365	for (j = 0; j < n; j++)
1723	2366	zp[j] = SubMod(xp[j], yp[j], q);
1724	2367	}
1725		}
1726
1727		void add(FFTRep& z, const FFTRep& x, const FFTRep& y)
1728		{
	2368
	2369	}
	2370
	2371
	2372	#ifdef NTL_THREAD_BOOST
	2373
	2374	void sub(FFTRep& z, const FFTRep& x, const FFTRep& y)
	2375	{
	2376	BasicThreadPool *pool = GetThreadPool();
	2377
	2378	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	2379	basic_sub(z, x, y);
	2380	return;
	2381	}
1729	2382	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1730	2383
1731		long k, n, i, j;
	2384	long k, n;
1732	2385
1733	2386	if (x.k != y.k) LogicError("FFT rep mismatch");
1734	2387

1737	2390
1738	2391	z.SetSize(k);
1739	2392
1740		for (i = 0; i < FFTInfo->NumPrimes; i++) {
	2393	long nprimes = FFTInfo->NumPrimes;
	2394
	2395	pool->exec_range(nprimes,
	2396	[&x, &y, &z, n](long first, long last) {
	2397	for (long i = first; i < last; i++) {
	2398	long *zp = &z.tbl[i][0];
	2399	const long *xp = &x.tbl[i][0];
	2400	const long *yp = &y.tbl[i][0];
	2401	long q = GetFFTPrime(i);
	2402
	2403	for (long j = 0; j < n; j++)
	2404	zp[j] = SubMod(xp[j], yp[j], q);
	2405	}
	2406	} );
	2407
	2408	}
	2409
	2410	#endif
	2411
	2412
	2413
	2414	NTL_TBDECL(add)(FFTRep& z, const FFTRep& x, const FFTRep& y)
	2415	{
	2416	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2417
	2418	long k, n, i, j;
	2419
	2420	if (x.k != y.k) LogicError("FFT rep mismatch");
	2421
	2422	k = x.k;
	2423	n = 1L << k;
	2424
	2425	z.SetSize(k);
	2426
	2427	long nprimes = FFTInfo->NumPrimes;
	2428
	2429	for (i = 0; i < nprimes; i++) {
1741	2430	long *zp = &z.tbl[i][0];
1742	2431	const long *xp = &x.tbl[i][0];
1743	2432	const long *yp = &y.tbl[i][0];

1746	2435	for (j = 0; j < n; j++)
1747	2436	zp[j] = AddMod(xp[j], yp[j], q);
1748	2437	}
1749		}
1750
1751
1752		void reduce(FFTRep& x, const FFTRep& a, long k)
	2438
	2439	}
	2440
	2441
	2442	#ifdef NTL_THREAD_BOOST
	2443
	2444	void add(FFTRep& z, const FFTRep& x, const FFTRep& y)
	2445	{
	2446	BasicThreadPool *pool = GetThreadPool();
	2447
	2448	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	2449	basic_add(z, x, y);
	2450	return;
	2451	}
	2452	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2453
	2454	long k, n;
	2455
	2456	if (x.k != y.k) LogicError("FFT rep mismatch");
	2457
	2458	k = x.k;
	2459	n = 1L << k;
	2460
	2461	z.SetSize(k);
	2462
	2463	long nprimes = FFTInfo->NumPrimes;
	2464
	2465	pool->exec_range(nprimes,
	2466	[&x, &y, &z, n](long first, long last) {
	2467	for (long i = first; i < last; i++) {
	2468	long *zp = &z.tbl[i][0];
	2469	const long *xp = &x.tbl[i][0];
	2470	const long *yp = &y.tbl[i][0];
	2471	long q = GetFFTPrime(i);
	2472
	2473	for (long j = 0; j < n; j++)
	2474	zp[j] = AddMod(xp[j], yp[j], q);
	2475	}
	2476	} );
	2477
	2478	}
	2479
	2480	#endif
	2481
	2482
	2483
	2484
	2485
	2486
	2487	NTL_TBDECL(reduce)(FFTRep& x, const FFTRep& a, long k)
1753	2488	// reduces a 2^l point FFT-rep to a 2^k point FFT-rep
1754	2489	// input may alias output
1755	2490	{

1766	2501
1767	2502	x.SetSize(k);
1768	2503
1769		for (i = 0; i < FFTInfo->NumPrimes; i++) {
	2504
	2505	long nprimes = FFTInfo->NumPrimes;
	2506
	2507	for (i = 0; i < nprimes; i++) {
1770	2508	ap = &a.tbl[i][0];
1771	2509	xp = &x.tbl[i][0];
1772	2510	for (j = 0; j < n; j++)

1774	2512	}
1775	2513	}
1776	2514
1777		void AddExpand(FFTRep& x, const FFTRep& a)
	2515
	2516	#ifdef NTL_THREAD_BOOST
	2517
	2518	void reduce(FFTRep& x, const FFTRep& a, long k)
	2519	// reduces a 2^l point FFT-rep to a 2^k point FFT-rep
	2520	// input may alias output
	2521	{
	2522	BasicThreadPool *pool = GetThreadPool();
	2523
	2524	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	2525	basic_reduce(x, a, k);
	2526	return;
	2527	}
	2528
	2529	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2530
	2531	long l, n;
	2532
	2533	l = a.k;
	2534	n = 1L << k;
	2535
	2536	if (l < k) LogicError("reduce: bad operands");
	2537
	2538	x.SetSize(k);
	2539
	2540
	2541	long nprimes = FFTInfo->NumPrimes;
	2542
	2543	pool->exec_range(nprimes,
	2544	[&x, &a, n, l, k](long first, long last) {
	2545	for (long i = first; i < last; i++) {
	2546	const long *ap = &a.tbl[i][0];
	2547	long *xp = &x.tbl[i][0];
	2548	for (long j = 0; j < n; j++)
	2549	xp[j] = ap[j << (l-k)];
	2550	}
	2551	} );
	2552	}
	2553
	2554	#endif
	2555
	2556
	2557
	2558
	2559	NTL_TBDECL(AddExpand)(FFTRep& x, const FFTRep& a)
1778	2560	// x = x + (an "expanded" version of a)
1779	2561	{
1780	2562	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();

1787	2569
1788	2570	if (l < k) LogicError("AddExpand: bad args");
1789	2571
1790		for (i = 0; i < FFTInfo->NumPrimes; i++) {
	2572
	2573	long nprimes = FFTInfo->NumPrimes;
	2574
	2575	for (i = 0; i < nprimes; i++) {
1791	2576	long q = GetFFTPrime(i);
1792	2577	const long *ap = &a.tbl[i][0];
1793	2578	long *xp = &x.tbl[i][0];

1798	2583	}
1799	2584	}
1800	2585
1801
1802
1803		void ToZZ_pXModRep(ZZ_pXModRep& y, const ZZ_pX& x, long lo, long hi)
	2586	#ifdef NTL_THREAD_BOOST
	2587
	2588	void AddExpand(FFTRep& x, const FFTRep& a)
	2589	// x = x + (an "expanded" version of a)
	2590	{
	2591	BasicThreadPool *pool = GetThreadPool();
	2592
	2593	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	2594	basic_AddExpand(x, a);
	2595	return;
	2596	}
	2597	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2598
	2599	long l, k, n;
	2600
	2601	l = x.k;
	2602	k = a.k;
	2603	n = 1L << k;
	2604
	2605	if (l < k) LogicError("AddExpand: bad args");
	2606
	2607
	2608	long nprimes = FFTInfo->NumPrimes;
	2609
	2610	pool->exec_range(nprimes,
	2611	[&x, &a, n, l, k](long first, long last) {
	2612	for (long i = first; i < last; i++) {
	2613	long q = GetFFTPrime(i);
	2614	const long *ap = &a.tbl[i][0];
	2615	long *xp = &x.tbl[i][0];
	2616	for (long j = 0; j < n; j++) {
	2617	long j1 = j << (l-k);
	2618	xp[j1] = AddMod(xp[j1], ap[j], q);
	2619	}
	2620	}
	2621	} );
	2622	}
	2623
	2624
	2625	#endif
	2626
	2627
	2628
	2629
	2630
	2631	NTL_TBDECL(ToZZ_pXModRep)(ZZ_pXModRep& y, const ZZ_pX& x, long lo, long hi)
1804	2632	{
1805	2633	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1806	2634	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
1807	2635
1808	2636
1809	2637	long n, i, j;
1810		vec_long& t = ModularRepBuf;
1811
	2638	vec_long& t = ModularRepBuf();
	2639
	2640
	2641	long nprimes = FFTInfo->NumPrimes;
1812	2642	t.SetLength(FFTInfo->NumPrimes);
1813	2643
1814	2644	if (lo < 0)

1822	2652
1823	2653	for (j = 0; j < n; j++) {
1824	2654	ToModularRep(t, xx[j+lo], FFTInfo, TmpSpace);
1825		for (i = 0; i < FFTInfo->NumPrimes; i++)
	2655	for (i = 0; i < nprimes; i++)
1826	2656	y.tbl[i][j] = t[i];
1827	2657	}
1828	2658	}
1829	2659
1830
1831		void ToFFTRep(FFTRep& x, const ZZ_pXModRep& a, long k, long lo, long hi)
1832		{
	2660	#ifdef NTL_THREAD_BOOST
	2661	void ToZZ_pXModRep(ZZ_pXModRep& y, const ZZ_pX& x, long lo, long hi)
	2662	{
	2663	BasicThreadPool *pool = GetThreadPool();
	2664
	2665	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	2666	basic_ToZZ_pXModRep(y, x, lo, hi);
	2667	return;
	2668	}
	2669
1833	2670	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
1834	2671
1835		vec_long s;
	2672
	2673	long n;
	2674
	2675	long nprimes = FFTInfo->NumPrimes;
	2676
	2677	if (lo < 0)
	2678	LogicError("bad arg to ToZZ_pXModRep");
	2679
	2680	hi = min(hi, deg(x));
	2681	n = max(hi-lo+1, 0);
	2682
	2683	y.SetSize(n);
	2684
	2685	const ZZ_p *xx = x.rep.elts();
	2686
	2687	ZZ_pContext local_context;
	2688	local_context.save();
	2689
	2690	pool->exec_range(n,
	2691	[lo, xx, &y, nprimes, &local_context, FFTInfo](long first, long last) {
	2692
	2693	local_context.restore();
	2694	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
	2695	// TmpSpace is thread local!
	2696
	2697	vec_long& t = ModularRepBuf();
	2698	t.SetLength(nprimes);
	2699
	2700	for (long j = first; j < last; j++) {
	2701	ToModularRep(t, xx[j+lo], FFTInfo, TmpSpace);
	2702	for (long i = 0; i < nprimes; i++)
	2703	y.tbl[i][j] = t[i];
	2704	}
	2705	} );
	2706	}
	2707	#endif
	2708
	2709
	2710
	2711
	2712
	2713
	2714
	2715
	2716
	2717	NTL_TBDECL(ToFFTRep)(FFTRep& x, const ZZ_pXModRep& a, long k, long lo, long hi)
	2718	{
	2719	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2720
1836	2721	long n, m, i, j;
1837	2722
1838	2723	if (k < 0 \|\| lo < 0)

1846	2731	if (m > n)
1847	2732	LogicError("bad args to ToFFTRep");
1848	2733
1849		s.SetLength(n);
1850		long *sp = s.elts();
1851	2734
1852	2735	x.SetSize(k);
1853	2736
1854		long NumPrimes = FFTInfo->NumPrimes;
1855
1856		for (i = 0; i < NumPrimes; i++) {
	2737	long nprimes = FFTInfo->NumPrimes;
	2738
	2739	if (m == 0) {
	2740	for (i = 0; i < nprimes; i++) {
	2741	long *xp = &x.tbl[i][0];
	2742	for (j = m; j < n; j++)
	2743	xp[j] = 0;
	2744	}
	2745	}
	2746	else {
	2747	for (i = 0; i < nprimes; i++) {
	2748	long *xp = &x.tbl[i][0];
	2749	long *ap = &a.tbl[i][0];
	2750	for (j = 0; j < m; j++)
	2751	xp[j] = ap[lo+j];
	2752	for (j = m; j < n; j++)
	2753	xp[j] = 0;
	2754
	2755	FFTFwd(xp, xp, k, i);
	2756	}
	2757	}
	2758	}
	2759
	2760	#ifdef NTL_THREAD_BOOST
		// Thread-boosted ToFFTRep.
		//
		// Fills x (sized via x.SetSize(k), rows of length n = 2^k) from the window
		// a[lo..hi] of the modular representation a: for every FFT prime index i,
		// row i of x receives a.tbl[i][lo .. lo+m-1] followed by zero padding up to
		// n, and FFTFwd is then applied to that row in place.
		//
		// Parameters:
		//   x      - output FFT representation
		//   a      - input modular representation (one row per FFT prime)
		//   k      - log2 of the transform length; must be >= 0
		//   lo, hi - inclusive coefficient window; lo must be >= 0, hi is clamped
		//            to a.n-1
		//
		// Raises LogicError("bad args to ToFFTRep") when k < 0, lo < 0, or the
		// window holds more than 2^k entries.
	2761	void ToFFTRep(FFTRep& x, const ZZ_pXModRep& a, long k, long lo, long hi)
	2762	{
	2763	BasicThreadPool *pool = GetThreadPool();
	2764
		// Fall back to the sequential basic_ToFFTRep when there is no pool, the
		// pool reports itself active (presumably already running a job -- confirm),
		// or it has only one thread.
	2765	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	2766	basic_ToFFTRep(x, a, k, lo, hi);
	2767	return;
	2768	}
	2769
	2770	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2771
		// n: transform length (2^k); m: number of coefficients copied from a
	2772	long n, m;
	2773
	2774	if (k < 0 \|\| lo < 0)
	2775	LogicError("bad args to ToFFTRep");
	2776
	2777	if (hi > a.n-1) hi = a.n-1;
	2778
	2779	n = 1L << k;
	2780	m = max(hi-lo+1, 0);
	2781
	2782	if (m > n)
	2783	LogicError("bad args to ToFFTRep");
	2784
	2785
	2786	x.SetSize(k);
	2787
	2788	long nprimes = FFTInfo->NumPrimes;
	2789
		// Empty window: every row is simply zeroed; no transform is run and the
		// thread pool is not used.
	2790	if (m == 0) {
	2791	for (long i = 0; i < nprimes; i++) {
	2792	long *xp = &x.tbl[i][0];
	2793	for (long j = m; j < n; j++)
	2794	xp[j] = 0;
	2795	}
	2796	}
	2797	else {
	2798
		// One unit of work per FFT prime. Each lambda call handles the prime
		// indices [first, last) and writes only rows x.tbl[i] for those i
		// (exec_range presumably splits [0, nprimes) across the pool's threads
		// and returns once all pieces are done -- confirm).
	2799	pool->exec_range(nprimes,
	2800	[&x, &a, lo, m, n, k](long first, long last) {
	2801
	2802	for (long i = first; i < last; i++) {
	2803	long *xp = &x.tbl[i][0];
	2804	long *ap = &a.tbl[i][0];
		// copy the window, then zero-pad to the full transform length
	2805	for (long j = 0; j < m; j++)
	2806	xp[j] = ap[lo+j];
	2807	for (long j = m; j < n; j++)
	2808	xp[j] = 0;
	2809
		// in-place forward transform of row i (input and output are both xp)
	2810	FFTFwd(xp, xp, k, i);
	2811	}
	2812	} );
	2813
	2814	}
	2815	}
	2816	#endif
	2817
	2818
	2819
	2820	void FromFFTRep(ZZ_pXModRep& x, const FFTRep& a)
	2821	{
	2822	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2823	long nprimes = FFTInfo->NumPrimes;
	2824	long k = a.k;
	2825	long n = 1L << k;
	2826
	2827	x.SetSize(n);
	2828	for (long i = 0; i < nprimes; i++) {
1857	2829	long *xp = &x.tbl[i][0];
1858		long *ap = (m == 0 ? 0 : &a.tbl[i][0]);
1859		for (j = 0; j < m; j++)
1860		sp[j] = ap[lo+j];
1861		for (j = m; j < n; j++)
1862		sp[j] = 0;
1863
1864		FFTFwd(xp, sp, k, i);
1865		}
1866		}
	2830	long *ap = &a.tbl[i][0];
	2831	FFTRev1(xp, ap, k, i);
	2832	}
	2833	}
	2834
		// Converts the window a[lo..hi] of a modular representation back into a
		// polynomial: x gets l = max(min(hi, a.n-1) - lo + 1, 0) coefficients, where
		// coefficient j is rebuilt by FromModularRep from the residues
		// a.tbl[i][j+lo], i = 0..nprimes-1.  x is normalized before returning.
		//
		// lo is not range-checked here; presumably callers pass lo >= 0 -- confirm.
	2835	void FromZZ_pXModRep(ZZ_pX& x, const ZZ_pXModRep& a, long lo, long hi)
	2836	{
	2837	const ZZ_pFFTInfoT *FFTInfo = ZZ_p::GetFFTInfo();
	2838	ZZ_pTmpSpaceT *TmpSpace = ZZ_p::GetTmpSpace();
	2839
	2840	long n = a.n;
	2841	long nprimes = FFTInfo->NumPrimes;
	2842
		// t: buffer holding the nprimes residues of one coefficient
	2843	vec_long& t = ModularRepBuf();
	2844	t.SetLength(nprimes);
	2845
		// clamp the window to the data actually present; l is its length
	2846	hi = min(hi, n-1);
	2847	long l = hi-lo+1;
	2848	l = max(l, 0);
	2849	x.rep.SetLength(l);
	2850
	2851	for (long j = 0; j < l; j++) {
		// gather column j+lo (one residue per prime) into t
	2852	for (long i = 0; i < nprimes; i++)
	2853	t[i] = a.tbl[i][j+lo];
	2854
	2855	FromModularRep(x.rep[j], t, FFTInfo, TmpSpace);
	2856	}
	2857
	2858	x.normalize();
	2859	}
	2860
	2861
1867	2862
1868	2863
1869	2864

2626	3621	long n = NumBits(e);
2627	3622	long i;
2628	3623
2629		ZZ_pX h;
	3624	ZZ_pX h, h1;
2630	3625
2631	3626	h.SetMaxLength(F.n);
2632	3627	set(h);
2633	3628
2634	3629	for (i = n - 1; i >= 0; i--) {
2635		SqrMod(h, h, F);
2636		if (bit(e, i))
2637		MulByXMod(h, h, F);
	3630	if (bit(e, i)) {
	3631	SqrMod(h1, h, F);
	3632	MulByXMod(h, h1, F);
	3633	// NOTE: MulByXMod gives much faster multicore performance
	3634	// when output does not alias input
	3635	}
	3636	else
	3637	SqrMod(h, h, F);
2638	3638	}
2639	3639
2640	3640	if (e < 0) InvMod(h, h, F);

+114

-4

src/ZZ_pX1.c less more

0	0
1	1	#include <NTL/ZZ_pX.h>
2
	2	#include <NTL/BasicThreadPool.h>
3	3	#include <NTL/new.h>
	4
	5
	6
	7
	8
4	9
5	10	NTL_START_IMPL
6	11

898	903
899	904
900	905
901		void InnerProduct(ZZ_pX& x, const vec_ZZ_p& v, long low, long high,
	906
	907	NTL_TBDECL(InnerProduct)(ZZ_pX& x, const vec_ZZ_p& v, long low, long high,
902	908	const vec_ZZ_pX& H, long n, ZZVec& t)
903	909	{
904	910	NTL_ZZRegister(s);

924	930	conv(x.rep[j], t[j]);
925	931	x.normalize();
926	932	}
	933
	934
	935	#ifdef NTL_THREAD_BOOST
	936
		// Thread-boosted InnerProduct.
		//
		// For each j in [0, n) computes
		//     t[j] = sum over i in [low, high] of rep(v[i]) * rep(H[i-low].rep[j])
		// (terms with j >= H[i-low].rep.length() are skipped), stores conv(t[j])
		// into x.rep[j], and finally normalizes x.
		//
		// Parameters:
		//   x         - output polynomial; x.rep is resized to n
		//   v         - coefficient vector; high is clamped to v.length()-1
		//   low, high - inclusive index range into v
		//   H         - polynomials; H[i-low] is paired with v[i]
		//   n         - number of output coefficients
		//   t         - scratch accumulators; entries t[0..n) are overwritten
	937	void InnerProduct(ZZ_pX& x, const vec_ZZ_p& v, long low, long high,
	938	const vec_ZZ_pX& H, long n, ZZVec& t)
	939	{
	940	BasicThreadPool *pool = GetThreadPool();
	941
		// Fall back to the sequential basic_InnerProduct when there is no pool, the
		// pool reports itself active, or it has only one thread.
	942	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	943	basic_InnerProduct(x, v, low, high, H, n, t);
	944	return;
	945	}
	946
	947	high = min(high, v.length()-1);
	948	x.rep.SetLength(n);
	949
		// Save the current ZZ_p context so each lambda invocation can restore it
		// (presumably needed because the modulus is per-thread state -- confirm).
	950	ZZ_pContext local_context;
	951	local_context.save();
	952
		// The output index range [0, n) is split: each lambda call owns the
		// indices [first, last) and touches only t[j] and x.rep[j] for those j.
	953	pool->exec_range(n,
	954	[low, high, &x, &t, &H, &v, &local_context](long first, long last) {
	955
	956	local_context.restore();
	957
		// s: scratch ZZ for the individual products
	958	NTL_ZZRegister(s);
	959
	960	for (long j = first; j < last; j++) clear(t[j]);
	961
	962	for (long i = low; i <= high; i++) {
	963	const vec_ZZ_p& h = H[i-low].rep;
		// m caps j at both the end of this slice and the length of h
	964	long m = min(h.length(), last);
	965	const ZZ& w = rep(v[i]);
	966
	967	for (long j = first; j < m; j++) {
	968	mul(s, w, rep(h[j]));
	969	add(t[j], t[j], s);
	970	}
	971	}
	972
	973	for (long j = first; j < last; j++) conv(x.rep[j], t[j]);
	974	} );
	975
	976	x.normalize();
	977	}
	978
	979	#endif
927	980
928	981
929	982	void CompMod(ZZ_pX& x, const ZZ_pX& g, const ZZ_pXArgument& A,

987	1040
988	1041
989	1042
990		NTL_THREAD_LOCAL long ZZ_pXArgBound = 0;
	1043	NTL_CHEAP_THREAD_LOCAL long ZZ_pXArgBound = 0;
991	1044
992	1045
993	1046	void CompMod(ZZ_pX& x, const ZZ_pX& g, const ZZ_pX& h, const ZZ_pXModulus& F)

1144	1197
1145	1198
1146	1199
1147		void ProjectPowers(vec_ZZ_p& x, const vec_ZZ_p& a, long k,
	1200	NTL_TBDECL(ProjectPowers)(vec_ZZ_p& x, const vec_ZZ_p& a, long k,
1148	1201	const ZZ_pXArgument& H, const ZZ_pXModulus& F)
1149	1202
1150	1203	{

1178	1231	}
1179	1232	}
1180	1233
	1234
	1235	#ifdef NTL_THREAD_BOOST
	1236
		// Thread-boosted ProjectPowers.
		//
		// Fills x[0..k) in blocks of m = H.H.length()-1 entries.  In block i,
		// x[i*m + j] = InnerProduct(H.H[j].rep, s) for j in [0, m1), where s starts
		// as a (with zeroes stripped) and is advanced between blocks by
		// UpdateMap(s, s, M, F), M being the multiplier built from H.H[m].
		//
		// Raises LogicError("ProjectPowers: bad args") when a.length() > F.n or
		// k < 0, and ResourceError("ProjectPowers: excessive args") when
		// NTL_OVERFLOW(k, 1, 0) holds.
	1237	void ProjectPowers(vec_ZZ_p& x, const vec_ZZ_p& a, long k,
	1238	const ZZ_pXArgument& H, const ZZ_pXModulus& F)
	1239
	1240	{
	1241	BasicThreadPool *pool = GetThreadPool();
	1242
		// Fall back to the sequential basic_ProjectPowers when there is no pool,
		// the pool reports itself active, or it has only one thread.
	1243	if (!pool \|\| pool->active() \|\| pool->NumThreads() == 1) {
	1244	basic_ProjectPowers(x, a, k, H, F);
	1245	return;
	1246	}
	1247
	1248	long n = F.n;
	1249
	1250	if (a.length() > n \|\| k < 0)
	1251	LogicError("ProjectPowers: bad args");
	1252	if (NTL_OVERFLOW(k, 1, 0))
	1253	ResourceError("ProjectPowers: excessive args");
	1254
	1255
		// m: block size; l: index of the last block, i.e. ceil(k/m) - 1.
		// NOTE(review): m == 0 (H.H of length 1) would divide by zero here;
		// presumably callers always supply at least two entries -- confirm.
	1256	long m = H.H.length()-1;
	1257	long l = (k+m-1)/m - 1;
	1258
	1259	ZZ_pXMultiplier M;
	1260	build(M, H.H[m], F);
	1261
	1262	vec_ZZ_p s(INIT_SIZE, n);
	1263	s = a;
	1264	StripZeroes(s);
	1265
	1266	x.SetLength(k);
	1267
		// Save the current ZZ_p context so each lambda invocation can restore it.
	1268	ZZ_pContext local_context;
	1269	local_context.save();
	1270
	1271
	1272	for (long i = 0; i <= l; i++) {
		// m1: entries in this block (the last block may be shorter than m)
	1273	long m1 = min(m, k-i*m);
	1274	ZZ_p* w = &x[i*m];
	1275
		// The m1 inner products of a block are independent of each other:
		// each lambda call writes only w[first..last) and only reads H and s.
	1276	pool->exec_range(m1,
	1277	[w, &H, &s, &local_context](long first, long last) {
	1278	local_context.restore();
	1279	for (long j = first; j < last; j++)
	1280	InnerProduct(w[j], H.H[j].rep, s);
	1281	} );
	1282
	1283
		// s is updated outside the parallel section, after the block is done;
		// the update is skipped after the final block.
	1284	if (i < l)
	1285	UpdateMap(s, s, M, F);
	1286	}
	1287	}
	1288
	1289
	1290	#endif
1181	1291
1182	1292
1183	1293	void ProjectPowers(vec_ZZ_p& x, const vec_ZZ_p& a, long k,

-6

src/ZZ_pXFactoring.c less more

682	682	return !IsX(s);
683	683	}
684	684
685		NTL_THREAD_LOCAL long ZZ_pX_BlockingFactor = 10;
	685	NTL_CHEAP_THREAD_LOCAL long ZZ_pX_BlockingFactor = 10;
686	686
687	687	void DDF(vec_pair_ZZ_pX_long& factors, const ZZ_pX& ff, const ZZ_pX& hh,
688	688	long verbose)

1466	1466
1467	1467	/*********** NEW DDF **************/
1468	1468
1469		NTL_THREAD_LOCAL long ZZ_pX_GCDTableSize = 4;
1470		NTL_THREAD_LOCAL double ZZ_pXFileThresh = NTL_FILE_THRESH;
1471		NTL_THREAD_LOCAL static vec_ZZ_pX *BabyStepFile = 0;
1472		NTL_THREAD_LOCAL static vec_ZZ_pX *GiantStepFile = 0;
1473		NTL_THREAD_LOCAL static long use_files;
	1469	NTL_CHEAP_THREAD_LOCAL long ZZ_pX_GCDTableSize = 4;
	1470	NTL_CHEAP_THREAD_LOCAL double ZZ_pXFileThresh = NTL_FILE_THRESH;
	1471	static NTL_CHEAP_THREAD_LOCAL vec_ZZ_pX *BabyStepFile = 0;
	1472	static NTL_CHEAP_THREAD_LOCAL vec_ZZ_pX *GiantStepFile = 0;
	1473	static NTL_CHEAP_THREAD_LOCAL long use_files;
1474	1474
1475	1475
1476	1476	static

+98

-11

src/c_lip_impl.h less more

4	4	#include <NTL/SmartPtr.h>
5	5
6	6	NTL_CLIENT
	7
	8	#ifdef NTL_THREADS
	9	#error "NTL_THREADS does not work with classic LIP: use GMP instead"
	10	#endif
7	11
8	12
9	13	#define MustAlloc(c, len) (!(c) \|\| ((c)[-1] >> 1) < (len))

2099	2103	/* signs a and b are different...use _ntl_zsub */
2100	2104
2101	2105	if (anegative) {
	2106	// UNSAFE
2102	2107	// FIXME: this is too ugly
2103	2108	a[0] = -sa;
2104	2109	NTL_SCOPE(guard) { if (!a_alias) a[0] = sa; };

2109	2114	guard.relax();
2110	2115	}
2111	2116	else {
	2117	// UNSAFE
2112	2118	// FIXME: this is too ugly
2113	2119	b[0] = -sb;
2114	2120	NTL_SCOPE(guard) { if (!b_alias) b[0] = sb; };

2235	2241	/* signs of a and b are different...use _ntl_zadd */
2236	2242
2237	2243	if (anegative) {
	2244	// UNSAFE
2238	2245	// FIXME: this is too ugly
2239	2246	a[0] = -sa;
2240	2247	NTL_SCOPE(guard) { if (!a_alias) a[0] = sa; };

2248	2255	c[0] = -c[0];
2249	2256	}
2250	2257	else {
	2258	// UNSAFE
2251	2259	// FIXME: this is too ugly
2252	2260	b[0] = -sb;
2253	2261	NTL_SCOPE(guard) { if (!b_alias) b[0] = sb; };

2320	2328
2321	2329	// EXCEPTIONS: delay assignment to a[0] until after memory allocation,
2322	2330	// the remaining code is exception free
	2331
	2332	// UNSAFE
2323	2333
2324	2334	a[0] = sa;
2325	2335

2612	2622	/* and subtract from T3 */
2613	2623
2614	2624	{
	2625	// UNSAFE
	2626
2615	2627	long olda, oldb;
2616	2628
2617	2629	olda = a[hsa]; a[hsa] = sa-hsa;

2627	2639	/* recursively compute a_lo*b_lo into low part of c */
2628	2640	/* and subtract from T3 */
2629	2641
	2642	// UNSAFE
	2643
2630	2644	*a = hsa;
2631	2645	*b = hsa;
2632	2646

2652	2666
2653	2667	/* recursively compute b*a_hi into high part of c */
2654	2668	{
	2669	// UNSAFE
	2670
2655	2671	long olda;
2656	2672
2657	2673	olda = a[hsa]; a[hsa] = sa-hsa;

2660	2676	}
2661	2677
2662	2678	/* recursively compute b*a_lo into T */
	2679
	2680	// UNSAFE
2663	2681
2664	2682	*a = hsa;
2665	2683	kar_mul(T, a, b, stk);

2731	2749	kar_fold(T1, a, hsa);
2732	2750	kar_sq(T2, T1, stk);
2733	2751
	2752	// UNSAFE
	2753
2734	2754	olda = a[hsa]; a[hsa] = sa - hsa;
2735	2755	kar_sq(c + (hsa << 1), a + hsa, stk);
2736	2756	kar_sub(T2, c + (hsa << 1));
2737	2757	a[hsa] = olda;
2738	2758
	2759	// UNSAFE
	2760
2739	2761	*a = hsa;
2740	2762	kar_sq(c, a, stk);
2741	2763	kar_sub(T2, c);

2780	2802	}
2781	2803
2782	2804
	2805
	2806	// UNSAFE
2783	2807
2784	2808	sa = *a;
2785	2809	if (sa < 0) {

3015	3039	a = mem;
3016	3040	}
3017	3041
	3042
	3043	// UNSAFE
	3044
3018	3045	sa = *a;
3019	3046
3020	3047	if (*a < 0) {

3521	3548	return;
3522	3549	}
3523	3550
	3551	// UNSAFE
	3552
3524	3553	sign = 0;
3525	3554	if (sa < 0) {
3526	3555	a[0] = sa = -sa;

3682	3711	_ntl_zintoz(_ntl_zsmod(a, -b[1]), rr);
3683	3712	return;
3684	3713	}
	3714
	3715	// UNSAFE
3685	3716
3686	3717	sign = 0;
3687	3718	if (sa < 0) {

5462	5493	return;
5463	5494	}
5464	5495
	5496	// UNSAFE
5465	5497
5466	5498	if (m1negative = (mm1[0] < 0))
5467	5499	mm1[0] = -mm1[0];

6157	6189	long i;
6158	6190	_ntl_verylong a;
6159	6191	long bitpos, wordpos, bitoffset, diff;
	6192	long nbits;
	6193	unsigned long carry, tmp;
	6194
	6195	while (n > 0 && p[n-1] == 0) n--;
6160	6196
6161	6197	if (n <= 0) {
6162	6198	_ntl_zzero(x);
6163	6199	return;
6164	6200	}
6165	6201
	6202
6166	6203	if (n > (NTL_MAX_LONG-(NTL_NBITS-1))/8)
6167	6204	ResourceError("ZZFromBytes: excessive length");
6168	6205
6169		sz = (n*8 + NTL_NBITS-1)/NTL_NBITS;
	6206	nbits = 0;
	6207	tmp = p[n-1];
	6208	while (tmp) {
	6209	tmp >>= 1;
	6210	nbits++;
	6211	}
	6212
	6213	sz = ((n-1)*8 + nbits + NTL_NBITS-1)/NTL_NBITS;
6170	6214
6171	6215	_ntl_zsetlength(x, sz);
6172	6216

6175	6219	for (i = 1; i <= sz; i++)
6176	6220	a[i] = 0;
6177	6221
	6222	carry = 0;
6178	6223	for (i = 0; i < n; i++) {
6179	6224	bitpos = i*8;
6180	6225	wordpos = bitpos/NTL_NBITS;
6181	6226	bitoffset = bitpos - wordpos*NTL_NBITS;
6182	6227	diff = NTL_NBITS-bitoffset;
6183	6228
6184		if (diff < 8) {
6185		a[wordpos+1] \|=
	6229	a[wordpos+1] \|= carry \|
6186	6230	((( ((unsigned long)(p[i])) & 255UL ) << bitoffset) & NTL_RADIXM);
6187		a[wordpos+2] = ( ((long)(p[i])) & 255 ) >> diff;
6188		}
6189		else {
6190		a[wordpos+1] \|= (( ((long)(p[i])) & 255 ) << bitoffset);
6191		}
6192		}
6193
6194		while (sz > 1 && a[sz] == 0) sz--;
	6231
	6232	carry = ( ((unsigned long)(p[i])) & 255UL ) >> diff;
	6233	}
	6234
	6235	a[sz] \|= carry;
6195	6236	a[0] = sz;
6196	6237	}
6197	6238

6681	6722	}
6682	6723
6683	6724
	6725	// boilerplate to provide compatible interface
		// Implementation of _ntl_reduce_struct that stores a modulus N and reduces
		// with a direct call to _ntl_zmod; no other state is kept.
	6726	class _ntl_reduce_struct_plain : public _ntl_reduce_struct {
	6727	public:
		// modulus; filled in by _ntl_reduce_struct_build
	6728	_ntl_verylong_wrapped N;
	6729
		// Calls _ntl_zmod(*TT, N, rres).
		// NOTE(review): the body dereferences TT, so the parameters are presumably
		// pointers whose '*' was lost in this rendering -- confirm against the
		// original file.
	6730	void eval(_ntl_verylong rres, _ntl_verylong TT)
	6731	{
	6732	_ntl_zmod(*TT, N, rres);
	6733	}
	6734
		// Intentionally a no-op in this implementation.
	6735	void adjust(_ntl_verylong *x) { }
	6736	};
	6737
		// Creates a _ntl_reduce_struct_plain, copies modulus into its N member, and
		// returns the raw pointer obtained from C.release() (presumably the caller
		// takes ownership -- confirm).  The excess argument is not used here.
	6738	_ntl_reduce_struct *
	6739	_ntl_reduce_struct_build(_ntl_verylong modulus, _ntl_verylong excess)
	6740	{
		// UniquePtr holds the object while the copy below runs
	6741	UniquePtr<_ntl_reduce_struct_plain> C;
	6742	C.make();
	6743
	6744	_ntl_zcopy(modulus, &C->N);
	6745
	6746	return C.release();
	6747	}
	6748
	6749
	6750
	6751
	6752	// general preconditioned remainder
	6753
	6754	class _ntl_general_rem_one_impl : public _ntl_general_rem_one_struct {
	6755	};
	6756
		// Stub: builds no precomputed data and always returns 0 (a null pointer).
		// Both arguments are ignored.
	6757	_ntl_general_rem_one_struct *
	6758	_ntl_general_rem_one_struct_build(long p, long sz)
	6759	{
	6760	return 0;
	6761	}
	6762
		// Returns _ntl_zsmod(a, p).  pinfo is ignored, which matches the build
		// routine in this file always returning 0.
	6763	long
	6764	_ntl_general_rem_one_struct_apply(NTL_verylong a, long p, _ntl_general_rem_one_struct *pinfo)
	6765	{
	6766	return _ntl_zsmod(a, p);
	6767	}
	6768
	6769
	6770

+98

-25

src/cfile less more

82	82
83	83	#endif
84	84
	85	#if @{NTL_DISABLE_TLS_HACK}
	86	#define NTL_DISABLE_TLS_HACK
	87
	88	/* Set if you want to compile NTL without "TLS hack"
	89	*
	90	* To re-build after changing this flag: rm *.o; make ntl.a
	91	*/
	92
	93	#endif
	94
	95	#if @{NTL_ENABLE_TLS_HACK}
	96	#define NTL_ENABLE_TLS_HACK
	97
	98	/* Set if you want to compile NTL with "TLS hack"
	99	*
	100	* To re-build after changing this flag: rm *.o; make ntl.a
	101	*/
	102
	103	#endif
	104
85	105	#if @{NTL_THREADS}
86	106	#define NTL_THREADS
87	107

103	123
104	124	#endif
105	125
	126	#if @{NTL_THREAD_BOOST}
	127	#define NTL_THREAD_BOOST
	128
	129	/* Set if you want to compile NTL to exploit threads internally.
	130	*
	131	* To re-build after changing this flag: rm *.o; make ntl.a
	132	*/
	133
	134	#endif
	135	#
106	136
107	137	#if @{NTL_GMP_LIP}
108	138	#define NTL_GMP_LIP

147	177
148	178	#endif
149	179
150		#if @{NTL_PCLMUL}
151		#define NTL_PCLMUL
152
153		/*
154		* Use this flag for faster GF2X arithmetc.
155		* This enables the use of the PCLMUL instruction on x86-64
156		* machines.
157		*
158		* To re-build after changing this flag:
159		* rm GF2X.o; make ntl.a
160		*/
161
162		#endif
163	180
164	181	#if @{FLAG_LONG_LONG_TYPE}
165	182	#define NTL_LONG_LONG_TYPE @{NTL_LONG_LONG_TYPE}

300	317	#if @{NTL_DISABLE_LONGDOUBLE}
301	318	#define NTL_DISABLE_LONGDOUBLE
302	319
303		/* Explicitly disables us of long double arithmetic in the
304		* single-precision modular arithmetic routines
	320	/* Explicitly disables use of long double arithmetic
305	321	*/
306	322
307	323	#endif

310	326	#if @{NTL_DISABLE_LONGLONG}
311	327	#define NTL_DISABLE_LONGLONG
312	328
313		/* Explicitly disables us of long long arithmetic in the
314		* single-precision modular arithmetic routines
315		*/
316
317		#endif
318
319
	329	/* Explicitly disables use of long long arithmetic
	330	*/
	331
	332	#endif
	333
	334	#if @{NTL_DISABLE_LL_ASM}
	335	#define NTL_DISABLE_LL_ASM
	336
	337	/* Explicitly disables use of inline assembly as a replacement
	338	* for long long arithmetic.
	339	*/
	340
	341	#endif
	342
	343
	344	#if @{NTL_MAXIMIZE_SP_NBITS}
	345	#define NTL_MAXIMIZE_SP_NBITS
	346
	347	/* Allows for 62-bit single-precision moduli on 64-bit platforms.
	348	* By default, such moduli are restricted to 60 bits, which
	349	* usually gives slightly better performance across a range of
	350	* parameters.
	351	*/
	352
	353	#endif
320	354
321	355	/*************************************************************************
322	356	*

505	539	#endif
506	540
507	541
	542	#if @{NTL_CRT_ALTCODE}
	543	#define NTL_CRT_ALTCODE
	544
	545	/*
	546	* Employs an alternative CRT strategy.
	547	* Only relevant with GMP.
	548	* Seems to be marginally faster on some x86_64 platforms.
	549	*
	550	* To re-build after changing this flag:
	551	* rm lip.o; make ntl.a
	552	*/
	553
	554	#endif
	555
	556	#if @{NTL_CRT_ALTCODE_SMALL}
	557	#define NTL_CRT_ALTCODE_SMALL
	558
	559	/*
	560	* Employs an alternative CRT strategy for small moduli.
	561	* Only relevant with GMP.
	562	* Seems to be marginally faster on some x86_64 platforms.
	563	*
	564	* To re-build after changing this flag:
	565	* rm lip.o; make ntl.a
	566	*/
	567
	568	#endif
	569
508	570
509	571	#if @{NTL_GF2X_ALTCODE}
510	572	#define NTL_GF2X_ALTCODE

547	609	#endif
548	610
549	611
	612	#if @{NTL_PCLMUL}
	613	#define NTL_PCLMUL
	614
	615	/*
	616	* Use this flag for faster GF2X arithmetic.
	617	* This enables the use of the PCLMUL instruction on x86-64
	618	* machines.
	619	*
	620	* To re-build after changing this flag:
	621	* rm GF2X.o; make ntl.a
	622	*/
	623
	624	#endif
550	625
551	626
552	627
553	628	@{WIZARD_HACK}
554	629
555	630
556
557
558		#endif
	631	#endif

-18

src/configure less more

6	6	# Also, some shells do not handle "$@" correctly when
7	7	# no options are supplied, so this is handled as a special case.
8	8
9		ARGS=""
10		rm -f RETRY_CONFIG
	9
	10
11	11
12	12	if test $# -ne 0
13	13	then

16	16	perl DoConfig
17	17	fi
18	18
19		while test -f RETRY_CONFIG
20		do
21		ARGS="$ARGS `cat RETRY_CONFIG`"
22		rm RETRY_CONFIG
23
24		if test $# -ne 0
25		then
26		perl DoConfig "$@" $ARGS
27		else
28		perl DoConfig $ARGS
29		fi
30
31		done
32
33
34

-11

src/ctools.c less more

20	20	* side effect of forcing its argument into memory.
21	21	*/
22	22
23		NTL_THREAD_LOCAL volatile double _ntl_IsFinite__local;
24		NTL_THREAD_LOCAL volatile double *_ntl_IsFinite__ptr1 = &_ntl_IsFinite__local;
25		NTL_THREAD_LOCAL volatile double *_ntl_IsFinite__ptr2 = &_ntl_IsFinite__local;
26		NTL_THREAD_LOCAL volatile double *_ntl_IsFinite__ptr3 = &_ntl_IsFinite__local;
27		NTL_THREAD_LOCAL volatile double *_ntl_IsFinite__ptr4 = &_ntl_IsFinite__local;
	23	NTL_CHEAP_THREAD_LOCAL volatile double _ntl_IsFinite__local = 0;
28	24
29	25	long _ntl_IsFinite(double *p)
30	26	{
31		_ntl_IsFinite__ptr1 = p;
32		_ntl_IsFinite__ptr3 = (_ntl_IsFinite__ptr2 - *p);
33		if (*_ntl_IsFinite__ptr4 != 0.0) return 0;
	27	_ntl_IsFinite__local = *p;
	28	double x1 = _ntl_IsFinite__local;
	29	double x2 = _ntl_IsFinite__local;
	30	double x3 = x1-x2;
	31	if (x3 != 0.0) return 0;
34	32	return 1;
35	33	}
36	34

47	45
48	46	void _ntl_ForceToMem(double *p)
49	47	{
50		_ntl_IsFinite__ptr1 = p;
51		p = _ntl_IsFinite__ptr2;
	48	_ntl_IsFinite__local = *p;
	49	*p = _ntl_IsFinite__local;
52	50	}
53	51
54	52

75	73	* overly-aggressive optimizing compilers from screwing things up.
76	74	*/
77	75
78		NTL_THREAD_LOCAL volatile double _ntl_ldexp_zero = 0.0;
	76	NTL_CHEAP_THREAD_LOCAL volatile double _ntl_ldexp_zero = 0.0;
79	77
80	78	double _ntl_ldexp(double x, long e)
81	79	{

+46

-32

src/def_makefile less more

12	12
13	13	CXXFLAGS=-g -O2
14	14	# Flags for the C++ compiler
	15
	16	CXXAUTOFLAGS=
	17	# Flags for the C++ compiler, automatically generated by configuration script
15	18
16	19
17	20	AR=ar

69	72
70	73	GMP_OPT_INCDIR=# -I$(GMP_INCDIR) # GMPI
71	74	GMP_OPT_LIBDIR=# -L$(GMP_LIBDIR) # GMPL
72		GMP_OPT_LIB=# -lgmp # GMP
	75	GMP_OPT_LIB=-lgmp # GMP
73	76	# uncomment these if using GMP
74	77
75	78

136	139	O16=$(O15)
137	140	O17=$(O16)
138	141	O18=$(O17) xdouble.o
139		O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o
	142	O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o BasicThreadPool.o
140	143
141	144	OBJ=$(O19)
142	145

161	164	S16=$(S15)
162	165	S17=$(S16)
163	166	S18=$(S17) xdouble.c
164		S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c
	167	S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c BasicThreadPool.c
165	168
166	169	SRC = $(S19)
167	170

193	196	IN16=$(IN15) vec_vec_ZZ_p.h vec_vec_ZZ_pE.h vec_vec_long.h vec_vec_lzz_p.h
194	197	IN17=$(IN16) vec_vec_lzz_pE.h vec_xdouble.h xdouble.h config.h version.h
195	198	IN18=$(IN17) def_config.h new.h vec_ulong.h vec_vec_ulong.h c_lip.h g_lip.h
196		IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h
197		IN20=$(IN19) have_LL_no.h have_LL_yes.h have_builtin_clzl_no.h have_builtin_clzl_yes.h
198
199		INCL=$(IN20)
	199	IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h BasicThreadPool.h
	200	INCL=$(IN19)
200	201
201	202
202	203

212	213	# test source files
213	214
214	215	TS1=QuickTest.c BerlekampTest.c CanZassTest.c ZZXFacTest.c MoreFacTest.c LLLTest.c
215		TS2=$(TS1) subset.c MatrixTest.c CharPolyTest.c RRTest.c QuadTest.c
	216	TS2=$(TS1) subset.c MatrixTest.c mat_lzz_pTest.c CharPolyTest.c RRTest.c QuadTest.c
216	217	TS3=$(TS2) GF2XTest.c GF2EXTest.c BitMatTest.c ZZ_pEXTest.c lzz_pEXTest.c Timing.c
217	218	TS4=$(TS3) ThreadTest.c ExceptionTest.c
218	219	TS = $(TS4)
219	220
220	221	# scripts
221	222
222		SCRIPTS1=MakeGetTime MakeGetPID MakeCheckCLZL MakeCheckLL TestScript dosify unixify RemoveProg
	223	SCRIPTS1=MakeGetTime MakeGetPID MakeCheckFeature ResetFeatures CopyFeatures TestScript dosify unixify RemoveProg
223	224	SCRIPTS2=$(SCRIPTS1) configure DoConfig mfile cfile ppscript
224	225
225	226	SCRIPTS=$(SCRIPTS2)
226	227
227	228	# auxiliary source
228	229
229		MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c CheckPCLMUL.c
230		GT=GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
	230	MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c
	231	GT=GetTime0.c GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
231	232	GP=GetPID1.c GetPID2.c TestGetPID.c
232		CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c
	233	CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c CheckAVX.c CheckFMA.c CheckCompile.c
	234
	235	AUXPROGS = TestGetTime TestGetPID CheckFeature CheckCompile
233	236
234	237
235	238
236	239	# documentation
237	240
238	241
239		D01=copying.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
	242	D01=copying.txt BasicThreadPool.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
240	243	D02=$(D01) GF2XFactoring.txt GF2XVec.txt HNF.txt Lazy.txt LazyTable.txt LLL.txt RR.txt SmartPtr.txt
241	244	D03=$(D02) ZZ.txt ZZVec.txt ZZX.txt ZZXFactoring.txt ZZ_p.txt ZZ_pE.txt
242	245	D04=$(D03) ZZ_pEX.txt ZZ_pEXFactoring.txt ZZ_pX.txt ZZ_pXFactoring.txt

252	255	D14=$(D13) tour-modules.html tour-unix.html tour-examples.html
253	256	D15=$(D14) tour-roadmap.html tour-win.html tour-impl.html tour-struct.html
254	257	D16=$(D15) tour.html tour-ex1.html tour-ex2.html tour-ex3.html tour-ex4.html
255		D17=$(D16) tour-ex5.html tour-ex6.html arrow1.gif arrow2.gif arrow3.gif
	258	D17=$(D16) tour-ex5.html tour-ex6.html tour-ex7.html arrow1.gif arrow2.gif arrow3.gif
256	259	D18=$(D17) tour-gmp.html tour-gf2x.html tour-tips.html config.txt version.txt
257	260
258	261	TX01=GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt GF2XFactoring.txt

263	266	TX06=mat_ZZ_pE.txt mat_lzz_p.txt mat_lzz_pE.txt mat_poly_ZZ.txt mat_poly_ZZ_p.txt
264	267	TX07=mat_poly_lzz_p.txt matrix.txt pair.txt quad_float.txt tools.txt vec_GF2.txt
265	268	TX08=vec_GF2E.txt vec_RR.txt vec_ZZ.txt vec_ZZ_p.txt vec_ZZ_pE.txt vec_lzz_p.txt
266		TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt
	269	TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt BasicThreadPool.txt
267	270
268	271	TXFILES=$(TX01) $(TX02) $(TX03) $(TX04) $(TX05) $(TX06) $(TX07) $(TX08) $(TX09)
269	272

275	278	HT06=mat_ZZ_pE.cpp.html mat_lzz_p.cpp.html mat_lzz_pE.cpp.html mat_poly_ZZ.cpp.html mat_poly_ZZ_p.cpp.html
276	279	HT07=mat_poly_lzz_p.cpp.html matrix.cpp.html pair.cpp.html quad_float.cpp.html tools.cpp.html vec_GF2.cpp.html
277	280	HT08=vec_GF2E.cpp.html vec_RR.cpp.html vec_ZZ.cpp.html vec_ZZ_p.cpp.html vec_ZZ_pE.cpp.html vec_lzz_p.cpp.html
278		HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html
	281	HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html BasicThreadPool.cpp.html
279	282
280	283	HTFILES=$(HT01) $(HT02) $(HT03) $(HT04) $(HT05) $(HT06) $(HT07) $(HT08) $(HT09)
281	284

287	290	# test program executables
288	291
289	292	PROG1=QuickTest BerlekampTest CanZassTest ZZXFacTest MoreFacTest LLLTest BitMatTest
290		PROG2=$(PROG1) MatrixTest CharPolyTest RRTest QuadTest
	293	PROG2=$(PROG1) MatrixTest mat_lzz_pTest CharPolyTest RRTest QuadTest
291	294	PROG3=$(PROG2) GF2XTest GF2EXTest subset ZZ_pEXTest lzz_pEXTest Timing ThreadTest
292	295	PROGS = $(PROG3)
293	296
294	297	# things to save to a tar file
295	298
296	299	SFI1=makefile $(SRC) $(SINC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) mach_desc.win
297		SFI2=$(SFI1) MulTimeTest.c PolyTimeTest.c Poly1TimeTest.c GF2XTimeTest.c
	300	SFI2=$(SFI1) MulTimeTest.c Poly1TimeTest.c Poly2TimeTest.c Poly3TimeTest.c GF2XTimeTest.c
298	301	SFI3=$(SFI2) InitSettings.c DispSettings.c WizardAux Wizard def_makefile
299	302	SFILES=$(SFI3)
300	303

309	312	NTL_INCLUDE = -I../include -I.
310	313	# NTL needs this to find its include files
311	314
312		COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) -c
313
314		LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS)
	315	COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) -c
	316
	317	LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) $(LDFLAGS)
315	318
316	319
317	320

341	344	# setup2 does some dynamic checks for GetTime, GetPID, __builtin_clzl, LL types, AVX, and FMA
342	345
343	346	setup2:
	347	echo "* CheckFeature log *" > CheckFeature.log
344	348	sh MakeGetTime "$(LINK)" "$(LDLIBS)"
345	349	sh MakeGetPID "$(LINK)" "$(LDLIBS)"
346		sh MakeCheckCLZL "$(LINK)" "$(LDLIBS)"
347		sh MakeCheckLL "$(LINK)" "$(LDLIBS)"
	350	sh MakeCheckFeature BUILTIN_CLZL "CheckCLZL.c CheckCLZLAux.c" "$(LINK)" "$(LDLIBS)"
	351	sh MakeCheckFeature LL_TYPE "CheckLL.c CheckLLAux.c" "$(LINK)" "$(LDLIBS)"
	352	sh MakeCheckFeature AVX "CheckAVX.c" "$(LINK)" "$(LDLIBS)"
	353	sh MakeCheckFeature FMA "CheckFMA.c" "$(LINK)" "$(LDLIBS)"
348	354
349	355	# setup3 generates the file ../include/NTL/gmp_aux.h
350	356	# The file ../include/NTL/gmp_aux.h is included in ../include/NTL/lip.h

382	388	GetPID.o: GetPID.c
383	389	$(LCOMP) $(COMPILE) GetPID.c
384	390
385		CheckPCLMUL: CheckPCLMUL.c
386		$(LINK) -o CheckPCLMUL CheckPCLMUL.c $(LDLIBS)
	391	CheckCompile: CheckCompile.c
	392	$(LINK) -o CheckCompile CheckCompile.c $(LDLIBS)
	393
387	394
388	395	.c.o:
389	396	$(LCOMP) $(COMPILE) $(GF2X_OPT_INCDIR) $<

460	467
461	468	clobber:
462	469	rm -f ntl.a mach_desc.h ../include/NTL/mach_desc.h GetTime.c GetPID.c
463		cp ../include/NTL/have_LL_no.h ../include/NTL/have_LL.h
464		cp ../include/NTL/have_builtin_clzl_no.h ../include/NTL/have_builtin_clzl.h
	470	sh ResetFeatures '..'
465	471	rm -f ../include/NTL/gmp_aux.h
466		sh RemoveProg $(PROGS) MakeDesc TestGetTime TestGetPID gen_gmp_aux
	472	sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
467	473	rm -f *.o
468	474	rm -rf small
469	475	rm -f cfileout mfileout

471	477	rm -f all
472	478
473	479	clean:
474		sh RemoveProg MakeDesc TestGetTime TestGetPID gen_gmp_aux
	480	sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
475	481	rm -f *.o
476	482	rm -rf small
477	483	# - $(LIBTOOL) --mode=clean rm -f libntl.la *.lo #LSHAR

497	503
498	504
499	505	package:
	506	./configure --nowrite
	507	cp mfileout def_makefile
	508	cp cfileout ../include/NTL/def_config.h
500	509	sh unixify "$(SFILES) DIRNAME WINDIR VERSION_INFO NOTES" "$(INCL)" "$(DOC)"
501	510	rm -rf `cat DIRNAME`
502	511	rm -f `cat DIRNAME`.tar

508	517	rm -rf `cat DIRNAME`
509	518
510	519	winpack:
	520	./configure --nowrite NTL_GMP_LIP=off
	521	cp mfileout def_makefile
	522	cp cfileout ../include/NTL/def_config.h
511	523	sh dosify "$(SRC)" "$(INCL)" "$(DOC)" "$(TS)" "$(TD)" "$(SINC)"
512	524	rm -rf `cat WINDIR`
513	525	rm -f `cat WINDIR`.zip

526	538
527	539	WO1 = FFT.o GetTime.o GetPID.o ctools.o ZZ.o ZZVec.o ZZ_p.o ZZ_pX.o
528	540	WO2 = $(WO1) ZZ_pX1.o lip.o tools.o vec_ZZ.o vec_ZZ_p.o
529		WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o fileio.o
	541	WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o BasicThreadPool.o fileio.o
530	542
531	543	WOBJ = $(WO3)
532	544

538	550	MulTimeTest:
539	551	$(LINK) -o MulTimeTest MulTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
540	552
541		PolyTimeTest:
542		$(LINK) -o PolyTimeTest PolyTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
543	553
544	554	Poly1TimeTest:
545	555	$(LINK) -o Poly1TimeTest Poly1TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
	556	Poly2TimeTest:
	557	$(LINK) -o Poly2TimeTest Poly2TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
	558	Poly3TimeTest:
	559	$(LINK) -o Poly3TimeTest Poly3TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
546	560
547	561
548	562	GF2XTimeTest:

-2

src/dosify less more

20	20	cp mach_desc.win dos/include/NTL/mach_desc.h
21	21
22	22
	23	cp GetTime0.c dos/GetTime/GetTime0.cpp
23	24	cp GetTime1.c dos/GetTime/GetTime1.cpp
24	25	cp GetTime2.c dos/GetTime/GetTime2.cpp
25	26	cp GetTime3.c dos/GetTime/GetTime3.cpp

60	61	cp $6 dos/src
61	62
62	63	cp ../include/NTL/def_config.h dos/include/NTL/config.h
63		cp ../include/NTL/have_LL_no.h dos/include/NTL/have_LL.h
64		cp ../include/NTL/have_builtin_clzl_no.h dos/include/NTL/have_builtin_clzl.h
	64	sh ResetFeatures dos
65	65
66	66

-6

src/fileio.c less more

81	81
82	82	const char FileName(const char stem, long d)
83	83	{
84		NTL_THREAD_LOCAL static string sbuf;
	84	NTL_TLS_LOCAL(string, sbuf);
85	85
86	86	stringstream ss;
87	87	ss << "tmp-ntl-" << stem;

111	111	static AtomicCounter cnt; // a GLOBAL counter
112	112
113	113
114		NTL_THREAD_LOCAL static string ID;
115		NTL_THREAD_LOCAL static bool initialized = false;
116		NTL_THREAD_LOCAL static unsigned long local_cnt = cnt.inc();
117		NTL_THREAD_LOCAL static unsigned long local_time = time(0);
118		NTL_THREAD_LOCAL static unsigned long local_clock = clock();
	114	NTL_TLS_LOCAL(string, ID);
	115
	116	NTL_TLS_LOCAL_INIT(bool, initialized, (false));
	117	NTL_TLS_LOCAL_INIT(unsigned long, local_cnt, (cnt.inc()));
	118	NTL_TLS_LOCAL_INIT(unsigned long, local_time, (time(0)));
	119	NTL_TLS_LOCAL_INIT(unsigned long, local_clock, (clock()));
119	120
120	121	if (!initialized) {
121	122	stringstream ss;

+1085

-112

src/g_lip_impl.h less more

342	342
343	343
344	344
	345	#if (defined(NTL_HAVE_LL_TYPE) && NTL_ZZ_NBITS == NTL_BITS_PER_LONG)
	346	#define NTL_VIABLE_LL
	347	#endif
	348
	349	#if (defined(NTL_CRT_ALTCODE) \|\| defined(NTL_CRT_ALTCODE_SMALL))
	350	#define NTL_TBL_CRT
	351	#endif
	352
	353
345	354
346	355	class _ntl_gbigint_watcher {
347	356	public:

394	403	// this logic onto what was originally pure-C code.
395	404
396	405
397		#define GRegister(x) NTL_THREAD_LOCAL static _ntl_gbigint_wrapped x; _ntl_gbigint_watcher _WATCHER__ ## x(&x)
	406	#define GRegister(x) NTL_TLS_LOCAL(_ntl_gbigint_wrapped, x); _ntl_gbigint_watcher _WATCHER__ ## x(&x)
398	407
399	408	// #define GRegister(x) NTL_THREAD_LOCAL static _ntl_gbigint x(0); _ntl_gbigint_watcher _WATCHER__ ## x(&x)
400	409

1326	1335
1327	1336	GET_SIZE_NEG(sn, nneg, n);
1328	1337
1329		limb_cnt = k/NTL_ZZ_NBITS;
1330		k %= NTL_ZZ_NBITS;
	1338	limb_cnt = ((unsigned long) k) / NTL_ZZ_NBITS;
	1339	k = ((unsigned long) k) % NTL_ZZ_NBITS;
1331	1340	sres = sn + limb_cnt;
1332	1341	if (k != 0) sres++;
1333	1342

1385	1394
1386	1395	GET_SIZE_NEG(sn, nneg, n);
1387	1396
1388		limb_cnt = k/NTL_ZZ_NBITS;
	1397	limb_cnt = ((unsigned long) k) / NTL_ZZ_NBITS;
1389	1398
1390	1399	sres = sn - limb_cnt;
1391	1400

1404	1413	ndata = DATA(n);
1405	1414	resdata = DATA(res);
1406	1415	ndata1 = ndata + limb_cnt;
1407		k %= NTL_ZZ_NBITS;
	1416	k = ((unsigned long) k) % NTL_ZZ_NBITS;
1408	1417
1409	1418	if (k != 0) {
1410	1419	mpn_rshift(resdata, ndata1, sres, k);

1522	1531	void
1523	1532	_ntl_gsadd(_ntl_gbigint a, long b, _ntl_gbigint *cc)
1524	1533	{
	1534	// FIXME: this is really inefficient...too much overhead
1525	1535	GRegister(B);
1526	1536	_ntl_gintoz(b, &B);
1527	1537	_ntl_gadd(a, B, cc);

2910	2920	{
2911	2921	GRegister(tmp);
2912	2922
2913		NTL_THREAD_LOCAL static double log_2;
2914		NTL_THREAD_LOCAL static long init = 0;
	2923	static const double log_2 = log(2.0); // GLOBAL (assumes C++11 thread-safe init)
2915	2924
2916	2925	long s;
2917	2926	long shamt;
2918	2927	long correction;
2919	2928	double x;
2920
2921		if (!init) {
2922		log_2 = log(2.0);
2923		init = 1;
2924		}
2925	2929
2926	2930	if (_ntl_gsign(n) <= 0)
2927	2931	ArithmeticError("log argument <= 0");

3502	3506	for (i = 0; i < m; i++) {
3503	3507	q = Tdata[i]*inv;
3504	3508	d = mpn_addmul_1(Tdata+i, Ndata, n, q);
	3509
	3510	// (c, Tdata[i+n]) = c + d + Tdata[i+n]
3505	3511	t = Tdata[i+n] + d;
3506	3512	Tdata[i+n] = t + c;
3507	3513	if (t < d \|\| (c == 1 && t + c == 0))

3523	3529
3524	3530	SIZE(res) = i;
3525	3531	SIZE(T) = 0;
	3532	}
	3533
	3534
	3535	// This montgomery code is for external consumption...
	3536	// This is currently used in the CRT reconstruction step
	3537	// for ZZ_pX arithmetic. It gives a nontrivial speedup
	3538	// for smallish p (up to a few hundred bits)
	3539
	3540	class _ntl_reduce_struct_montgomery : public _ntl_reduce_struct {
	3541	public:
	3542	long m;
	3543	mp_limb_t inv;
	3544	_ntl_gbigint_wrapped N;
	3545
	3546	void eval(_ntl_gbigint rres, _ntl_gbigint TT);
	3547	void adjust(_ntl_gbigint *x);
	3548	};
	3549
	3550
	3551
	3552	// DIRT: may not work with non-empty "nails"
	3553
	3554	void _ntl_reduce_struct_montgomery::eval(_ntl_gbigint rres, _ntl_gbigint TT)
	3555	{
	3556	long n, sT, i;
	3557	mp_limb_t Ndata, Tdata, *resdata, q, d, t, c;
	3558	_ntl_gbigint res, T;
	3559
	3560
	3561	T = *TT;
	3562
	3563	// quick zero test, in case of sparse polynomials
	3564	if (ZEROP(T)) {
	3565	_ntl_gzero(rres);
	3566	return;
	3567	}
	3568
	3569	n = SIZE(N);
	3570	Ndata = DATA(N);
	3571
	3572	if (MustAlloc(T, m+n)) {
	3573	_ntl_gsetlength(&T, m+n);
	3574	*TT = T;
	3575	}
	3576
	3577	res = *rres;
	3578	if (MustAlloc(res, n)) {
	3579	_ntl_gsetlength(&res, n);
	3580	*rres = res;
	3581	}
	3582
	3583	sT = SIZE(T);
	3584	Tdata = DATA(T);
	3585	resdata = DATA(res);
	3586
	3587	for (i = sT; i < m+n; i++)
	3588	Tdata[i] = 0;
	3589
	3590	c = 0;
	3591	for (i = 0; i < m; i++) {
	3592	q = Tdata[i]*inv;
	3593	d = mpn_addmul_1(Tdata+i, Ndata, n, q);
	3594
	3595	// (c, Tdata[i+n]) = c + d + Tdata[i+n]
	3596	t = Tdata[i+n] + d;
	3597	Tdata[i+n] = t + c;
	3598	if (t < d \|\| (c == 1 && t + c == 0))
	3599	c = 1;
	3600	else
	3601	c = 0;
	3602	}
	3603
	3604	if (c \|\| mpn_cmp(Tdata + m, Ndata, n) >= 0) {
	3605	mpn_sub_n(resdata, Tdata + m, Ndata, n);
	3606	}
	3607	else {
	3608	for (i = 0; i < n; i++)
	3609	resdata[i] = Tdata[m + i];
	3610	}
	3611
	3612	i = n;
	3613	STRIP(i, resdata);
	3614
	3615	SIZE(res) = i;
	3616	SIZE(T) = 0;
	3617	}
	3618
	3619	// this will adjust the given number by multiplying by the
	3620	// montgomery scaling factor
	3621
	3622	void _ntl_reduce_struct_montgomery::adjust(_ntl_gbigint *x)
	3623	{
	3624	GRegister(tmp);
	3625	_ntl_glshift(x, mNTL_ZZ_NBITS, &tmp);
	3626	_ntl_gmod(tmp, N, x);
	3627	}
	3628
	3629
	3630
	3631
	3632	class _ntl_reduce_struct_plain : public _ntl_reduce_struct {
	3633	public:
	3634	_ntl_gbigint_wrapped N;
	3635
	3636	void eval(_ntl_gbigint rres, _ntl_gbigint TT)
	3637	{
	3638	_ntl_gmod(*TT, N, rres);
	3639	}
	3640
	3641	void adjust(_ntl_gbigint *x) { }
	3642	};
	3643
	3644	// assumption: all values passed to eval for montgomery reduction
	3645	// are in [0, modulus*excess]
	3646
		// Creates a reduction object for the given modulus.
		//   modulus - the modulus; an odd value selects the Montgomery variant,
		//             anything else the plain _ntl_gmod-based variant
		//   excess  - only its limb count (_ntl_gsize) is used, as the number
		//             of Montgomery passes m
		// Returns the raw pointer released from the UniquePtr
		// (NOTE(review): presumably the caller takes ownership - confirm).
	3647	_ntl_reduce_struct *
	3648	_ntl_reduce_struct_build(_ntl_gbigint modulus, _ntl_gbigint excess)
	3649	{
	3650	if (_ntl_godd(modulus)) {
	3651	UniquePtr<_ntl_reduce_struct_montgomery> C;
	3652	C.make();
	3653
	3654	C->m = _ntl_gsize(excess);
		// inv is derived from the low limb of the modulus only
	3655	C->inv = neg_inv_mod_limb(DATA(modulus)[0]);
	3656	_ntl_gcopy(modulus, &C->N);
	3657
	3658	return C.release();
	3659	}
	3660	else {
	3661	UniquePtr<_ntl_reduce_struct_plain> C;
	3662	C.make();
	3663
	3664	_ntl_gcopy(modulus, &C->N);
	3665
	3666	return C.release();
	3667	}
3526	3668	}
3527	3669
3528	3670

3839	3981
3840	3982	void _ntl_gfrombytes(_ntl_gbigint x, const unsigned char p, long n)
3841	3983	{
3842		long BytesPerLimb;
3843	3984	long lw, r, i, j;
3844	3985	mp_limb_t *xp, t;
3845	3986
	3987	while (n > 0 && p[n-1] == 0) n--;
	3988
3846	3989	if (n <= 0) {
3847		x = 0;
	3990	_ntl_gzero(x);
3848	3991	return;
3849	3992	}
3850	3993
3851		BytesPerLimb = NTL_ZZ_NBITS/8;
	3994	const long BytesPerLimb = NTL_ZZ_NBITS/8;
3852	3995
3853	3996
3854	3997	lw = n/BytesPerLimb;

3882	4025	t >>= (BytesPerLimb-r)*8;
3883	4026	xp[lw-1] = t;
3884	4027
3885		STRIP(lw, xp);
	4028	// strip not necessary here
	4029	// STRIP(lw, xp);
3886	4030	SIZE(*x) = lw;
3887	4031	}
3888	4032

3892	4036
3893	4037	void _ntl_gbytesfromz(unsigned char *p, _ntl_gbigint a, long n)
3894	4038	{
3895		long BytesPerLimb;
3896	4039	long lbits, lbytes, min_bytes, min_words, r;
3897	4040	long i, j;
3898	4041	mp_limb_t *ap, t;
3899	4042
3900	4043	if (n < 0) n = 0;
3901	4044
3902		BytesPerLimb = NTL_ZZ_NBITS/8;
	4045	const long BytesPerLimb = NTL_ZZ_NBITS/8;
3903	4046
3904	4047	lbits = _ntl_g2log(a);
3905	4048	lbytes = (lbits+7)/8;

4174	4317	void eval(_ntl_gbigint x, const long b, _ntl_tmp_vec *tmp_vec);
4175	4318	};
4176	4319
	4320
	4321	#if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
	4322
	4323	class _ntl_crt_struct_tbl : public _ntl_crt_struct {
	4324	public:
	4325	Unique2DArray<mp_limb_t> v;
	4326	long n;
	4327	long sz;
	4328
	4329	bool special();
	4330	void insert(long i, _ntl_gbigint m);
	4331	_ntl_tmp_vec *extract();
	4332	_ntl_tmp_vec *fetch();
	4333	void eval(_ntl_gbigint x, const long b, _ntl_tmp_vec *tmp_vec);
	4334
	4335	};
	4336
	4337	#endif
	4338
	4339
	4340
	4341
4177	4342	class _ntl_crt_struct_fast : public _ntl_crt_struct {
4178	4343	public:
4179	4344	long n;

4307	4472	return C.release();
4308	4473	}
4309	4474
	4475
	4476	#if (defined(NTL_VIABLE_LL))
	4477
	4478	// alternative CRT code is viable
	4479
	4480	#if (defined(NTL_CRT_ALTCODE))
	4481	// unconditionally use the alternative code,
	4482	// as the tuning wizard says it's preferable for larger moduli
	4483
	4484	{
	4485	UniquePtr<_ntl_crt_struct_tbl> C;
	4486	C.make();
	4487	C->n = n;
	4488	C->sz = SIZE(p);
	4489	C->v.SetDims(C->sz, C->n);
	4490
	4491	return C.release();
	4492	}
	4493	#elif (defined(NTL_CRT_ALTCODE_SMALL))
	4494	// use the alternative code on "smaller" moduli...
	4495	// For now, this triggers when n <= 16.
	4496	// Unless the "long long" compiler support is really bad,
	4497	// this should be a marginal win, as it avoids some
	4498	// procedure call overhead.
	4499
	4500	if (n <= 16) {
	4501	UniquePtr<_ntl_crt_struct_tbl> C;
	4502	C.make();
	4503	C->n = n;
	4504	C->sz = SIZE(p);
	4505	C->v.SetDims(C->sz, C->n);
	4506
	4507	return C.release();
	4508	}
	4509	else {
	4510	UniquePtr<_ntl_crt_struct_basic> C;
	4511	C.make();
	4512
	4513	long i;
	4514
	4515	C->n = n;
	4516	C->v.SetLength(n);
	4517	C->sbuf = SIZE(p)+2;
	4518
	4519	return C.release();
	4520	}
	4521	#else
4310	4522	{
4311	4523	UniquePtr<_ntl_crt_struct_basic> C;
4312	4524	C.make();

4319	4531
4320	4532	return C.release();
4321	4533	}
	4534	#endif
	4535
	4536	#else
	4537	{
	4538	UniquePtr<_ntl_crt_struct_basic> C;
	4539	C.make();
	4540
	4541	long i;
	4542
	4543	C->n = n;
	4544	C->v.SetLength(n);
	4545	C->sbuf = SIZE(p)+2;
	4546
	4547	return C.release();
	4548	}
	4549	#endif
	4550
4322	4551	}
4323	4552
4324	4553	/* extracts existing tmp_vec, if possible -- read/write operation */

4327	4556	{
4328	4557	return 0;
4329	4558	}
	4559
	4560	#if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
		// The table variant has no temporary vector: always returns null.
	4561	_ntl_tmp_vec *_ntl_crt_struct_tbl::extract()
	4562	{
	4563	return 0;
	4564	}
	4565	#endif
4330	4566
4331	4567	_ntl_tmp_vec *_ntl_crt_struct_fast::extract()
4332	4568	{

4343	4579	{
4344	4580	return 0;
4345	4581	}
	4582
	4583	#if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
		// The table variant has no temporary vector: always returns null.
	4584	_ntl_tmp_vec *_ntl_crt_struct_tbl::fetch()
	4585	{
	4586	return 0;
	4587	}
	4588	#endif
4346	4589
4347	4590	_ntl_tmp_vec *_ntl_crt_struct_fast::fetch()
4348	4591	{

4362	4605	{
4363	4606	_ntl_gcopy(m, &v[i]);
4364	4607	}
	4608
	4609	#if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
	4610	void _ntl_crt_struct_tbl::insert(long i, _ntl_gbigint m)
	4611	{
	4612	if (i < 0 \|\| i >= n) LogicError("insert: bad args");
	4613
	4614	if (!m)
	4615	for (long j = 0; j < sz; j++) v[j][i] = 0;
	4616	else {
	4617	long sm = SIZE(m);
	4618	if (sm < 0 \|\| sm > sz) LogicError("insert: bad args");
	4619	const mp_limb_t *mdata = DATA(m);
	4620	for (long j = 0; j < sm; j++)
	4621	v[j][i] = mdata[j];
	4622	for (long j = sm; j < sz; j++)
	4623	v[j][i] = 0;
	4624	}
	4625	}
	4626	#endif
4365	4627
4366	4628	void _ntl_crt_struct_fast::insert(long i, _ntl_gbigint m)
4367	4629	{

4453	4715	SIZE(x1) = sx;
4454	4716	}
4455	4717
	4718
	4719	#if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
	4720
	4721	#define CRT_ALTCODE_UNROLL (1)
	4722
	4723	void _ntl_crt_struct_tbl::eval(_ntl_gbigint x, const long b, _ntl_tmp_vec *generic_tmp_vec)
	4724	{
	4725	long sx;
	4726	_ntl_gbigint x1;
	4727	long i, j;
	4728
	4729	// quick test for zero vector
	4730	// most likely, they are either all zero (if we are working
	4731	// with some sparse polynomials) or none of them are zero,
	4732	// so in the general case, this should go fast
	4733	if (!b[0]) {
	4734	i = 1;
	4735	while (i < n && !b[i]) i++;
	4736	if (i >= n) {
	4737	_ntl_gzero(x);
	4738	return;
	4739	}
	4740	}
	4741
	4742	sx = sz + 2;
	4743	_ntl_gsetlength(x, sx);
	4744	x1 = *x;
	4745	mp_limb_t * NTL_RESTRICT xx = DATA(x1);
	4746
	4747
	4748	const long Bnd = 1L << (NTL_BITS_PER_LONG-NTL_SP_NBITS);
	4749
	4750	if (n <= Bnd) {
	4751	mp_limb_t carry=0;
	4752
	4753	for (i = 0; i < sz; i++) {
	4754	const mp_limb_t *row = v[i];
	4755
	4756	ll_type acc;
	4757	ll_mul(acc, row[0], b[0]);
	4758
	4759	#if (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
	4760	switch (n) {
	4761	case 16: ll_mul_add(acc, row[16-1], b[16-1]);
	4762	case 15: ll_mul_add(acc, row[15-1], b[15-1]);
	4763	case 14: ll_mul_add(acc, row[14-1], b[14-1]);
	4764	case 13: ll_mul_add(acc, row[13-1], b[13-1]);
	4765	case 12: ll_mul_add(acc, row[12-1], b[12-1]);
	4766	case 11: ll_mul_add(acc, row[11-1], b[11-1]);
	4767	case 10: ll_mul_add(acc, row[10-1], b[10-1]);
	4768	case 9: ll_mul_add(acc, row[9-1], b[9-1]);
	4769	case 8: ll_mul_add(acc, row[8-1], b[8-1]);
	4770	case 7: ll_mul_add(acc, row[7-1], b[7-1]);
	4771	case 6: ll_mul_add(acc, row[6-1], b[6-1]);
	4772	case 5: ll_mul_add(acc, row[5-1], b[5-1]);
	4773	case 4: ll_mul_add(acc, row[4-1], b[4-1]);
	4774	case 3: ll_mul_add(acc, row[3-1], b[3-1]);
	4775	case 2: ll_mul_add(acc, row[2-1], b[2-1]);
	4776	}
	4777	#else
	4778	for (j = 1; j < n; j++)
	4779	ll_mul_add(acc, row[j], b[j]);
	4780	#endif
	4781
	4782	ll_add(acc, carry);
	4783	xx[i] = ll_get_lo(acc);
	4784	carry = ll_get_hi(acc);
	4785	}
	4786
	4787	xx[sz] = carry;
	4788	xx[sz+1] = 0;
	4789	}
	4790	else {
	4791	ll_type carry;
	4792	ll_init(carry, 0);
	4793
	4794	for (i = 0; i < sz; i++) {
	4795	const mp_limb_t *row = v[i];
	4796
	4797	ll_type acc21;
	4798	mp_limb_t acc0;
	4799
	4800	{
	4801	ll_type sum;
	4802	ll_mul(sum, row[0], b[0]);
	4803
	4804	#if (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
	4805	ll_mul_add(sum, row[1], b[1]);
	4806	ll_mul_add(sum, row[2], b[2]);
	4807	ll_mul_add(sum, row[3], b[3]);
	4808	ll_mul_add(sum, row[4], b[4]);
	4809	ll_mul_add(sum, row[5], b[5]);
	4810	ll_mul_add(sum, row[6], b[6]);
	4811	ll_mul_add(sum, row[7], b[7]);
	4812	ll_mul_add(sum, row[8], b[8]);
	4813	ll_mul_add(sum, row[9], b[9]);
	4814	ll_mul_add(sum, row[10], b[10]);
	4815	ll_mul_add(sum, row[11], b[11]);
	4816	ll_mul_add(sum, row[12], b[12]);
	4817	ll_mul_add(sum, row[13], b[13]);
	4818	ll_mul_add(sum, row[14], b[14]);
	4819	ll_mul_add(sum, row[15], b[15]);
	4820	#elif (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 2)
	4821	ll_mul_add(sum, row[1], b[1]);
	4822	ll_mul_add(sum, row[2], b[2]);
	4823	ll_mul_add(sum, row[3], b[3]);
	4824	#else
	4825	for (j = 1; j < Bnd; j++)
	4826	ll_mul_add(sum, row[j], b[j]);
	4827	#endif
	4828
	4829
	4830	ll_init(acc21, ll_get_hi(sum));
	4831	acc0 = ll_get_lo(sum);
	4832	}
	4833
	4834	const mp_limb_t *ap = row;
	4835	const long *tp = b;
	4836
	4837	#if (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 2)
	4838	long m = n - 4;
	4839	ap += 4;
	4840	tp += 4;
	4841
	4842	for (; m >= 8; m -= 8, ap += 8, tp += 8) {
	4843	{
	4844	ll_type sum;
	4845	ll_mul(sum, ap[0], tp[0]);
	4846	ll_mul_add(sum, ap[1], tp[1]);
	4847	ll_mul_add(sum, ap[2], tp[2]);
	4848	ll_mul_add(sum, ap[3], tp[3]);
	4849
	4850	ll_add(sum, acc0);
	4851	acc0 = ll_get_lo(sum);
	4852	ll_add(acc21, ll_get_hi(sum));
	4853	}
	4854	{
	4855	ll_type sum;
	4856	ll_mul(sum, ap[4+0], tp[4+0]);
	4857	ll_mul_add(sum, ap[4+1], tp[4+1]);
	4858	ll_mul_add(sum, ap[4+2], tp[4+2]);
	4859	ll_mul_add(sum, ap[4+3], tp[4+3]);
	4860
	4861	ll_add(sum, acc0);
	4862	acc0 = ll_get_lo(sum);
	4863	ll_add(acc21, ll_get_hi(sum));
	4864	}
	4865	}
	4866
	4867	for (; m >= 4; m -= 4, ap += 4, tp += 4) {
	4868	ll_type sum;
	4869	ll_mul(sum, ap[0], tp[0]);
	4870	ll_mul_add(sum, ap[1], tp[1]);
	4871	ll_mul_add(sum, ap[2], tp[2]);
	4872	ll_mul_add(sum, ap[3], tp[3]);
	4873
	4874	ll_add(sum, acc0);
	4875	acc0 = ll_get_lo(sum);
	4876	ll_add(acc21, ll_get_hi(sum));
	4877	}
	4878
	4879
	4880	#else
	4881	long m;
	4882	for (m = n-Bnd, ap += Bnd, tp += Bnd; m >= Bnd; m -= Bnd, ap += Bnd, tp += Bnd) {
	4883
	4884	ll_type sum;
	4885	ll_mul(sum, ap[0], tp[0]);
	4886
	4887	#if (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
	4888	ll_mul_add(sum, ap[1], tp[1]);
	4889	ll_mul_add(sum, ap[2], tp[2]);
	4890	ll_mul_add(sum, ap[3], tp[3]);
	4891	ll_mul_add(sum, ap[4], tp[4]);
	4892	ll_mul_add(sum, ap[5], tp[5]);
	4893	ll_mul_add(sum, ap[6], tp[6]);
	4894	ll_mul_add(sum, ap[7], tp[7]);
	4895	ll_mul_add(sum, ap[8], tp[8]);
	4896	ll_mul_add(sum, ap[9], tp[9]);
	4897	ll_mul_add(sum, ap[10], tp[10]);
	4898	ll_mul_add(sum, ap[11], tp[11]);
	4899	ll_mul_add(sum, ap[12], tp[12]);
	4900	ll_mul_add(sum, ap[13], tp[13]);
	4901	ll_mul_add(sum, ap[14], tp[14]);
	4902	ll_mul_add(sum, ap[15], tp[15]);
	4903	#else
	4904	for (long j = 1; j < Bnd; j++)
	4905	ll_mul_add(sum, ap[j], tp[j]);
	4906	#endif
	4907
	4908	ll_add(sum, acc0);
	4909	acc0 = ll_get_lo(sum);
	4910	ll_add(acc21, ll_get_hi(sum));
	4911	}
	4912	#endif
	4913
	4914	if (m > 0) {
	4915	ll_type sum;
	4916	ll_mul(sum, ap[0], tp[0]);
	4917
	4918	#if (CRT_ALTCODE_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
	4919	switch (m) {
	4920	case 15: ll_mul_add(sum, ap[15-1], tp[15-1]);
	4921	case 14: ll_mul_add(sum, ap[14-1], tp[14-1]);
	4922	case 13: ll_mul_add(sum, ap[13-1], tp[13-1]);
	4923	case 12: ll_mul_add(sum, ap[12-1], tp[12-1]);
	4924	case 11: ll_mul_add(sum, ap[11-1], tp[11-1]);
	4925	case 10: ll_mul_add(sum, ap[10-1], tp[10-1]);
	4926	case 9: ll_mul_add(sum, ap[9-1], tp[9-1]);
	4927	case 8: ll_mul_add(sum, ap[8-1], tp[8-1]);
	4928	case 7: ll_mul_add(sum, ap[7-1], tp[7-1]);
	4929	case 6: ll_mul_add(sum, ap[6-1], tp[6-1]);
	4930	case 5: ll_mul_add(sum, ap[5-1], tp[5-1]);
	4931	case 4: ll_mul_add(sum, ap[4-1], tp[4-1]);
	4932	case 3: ll_mul_add(sum, ap[3-1], tp[3-1]);
	4933	case 2: ll_mul_add(sum, ap[2-1], tp[2-1]);
	4934	}
	4935	#else
	4936	for (m--, ap++, tp++; m > 0; m--, ap++, tp++)
	4937	ll_mul_add(sum, ap[0], tp[0]);
	4938	#endif
	4939	ll_add(sum, acc0);
	4940	acc0 = ll_get_lo(sum);
	4941	ll_add(acc21, ll_get_hi(sum));
	4942
	4943	}
	4944
	4945	ll_add(carry, acc0);
	4946	xx[i] = ll_get_lo(carry);
	4947	ll_add(acc21, ll_get_hi(carry));
	4948	carry = acc21;
	4949	}
	4950
	4951	xx[sz] = ll_get_lo(carry);
	4952	xx[sz+1] = ll_get_hi(carry);
	4953	}
	4954
	4955
	4956	while (sx > 0 && xx[sx-1] == 0) sx--;
	4957	SIZE(x1) = sx;
	4958	}
	4959	#endif
4456	4960
4457	4961	void _ntl_crt_struct_fast::eval(_ntl_gbigint x, const long b, _ntl_tmp_vec *generic_tmp_vec)
4458	4962	{

4498	5002
4499	5003
4500	5004	bool _ntl_crt_struct_basic::special() { return false; }
	5005
	5006	#if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_CRT))
		// The table variant reports false, as the basic variant does; only
		// the fast variant reports true.
	5007	bool _ntl_crt_struct_tbl::special() { return false; }
	5008	#endif
	5009
	5010
4501	5011	bool _ntl_crt_struct_fast::special() { return true; }
4502	5012
4503	5013

4558	5068
4559	5069
4560	5070
4561		#ifdef NTL_TBL_REM
	5071	#if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_REM))
4562	5072
4563	5073	class _ntl_rem_struct_tbl : public _ntl_rem_struct {
4564	5074	public:

4579	5089	_ntl_rem_struct _ntl_rem_struct_build(long n, _ntl_gbigint modulus, long (p)(long))
4580	5090	{
4581	5091
4582		#ifdef NTL_TBL_REM
4583		if (n <= 800
4584		&& sizeof(NTL_ULL_TYPE) == 2*sizeof(long)
4585		&& NTL_ZZ_NBITS == NTL_BITS_PER_LONG) {
4586
	5092	#if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_REM))
	5093	if (n <= 800) {
4587	5094	UniqueArray<long> q;
4588	5095	UniqueArray<mp_limb_t> inv_primes;
4589	5096	Unique2DArray<mp_limb_t> tbl;

4627	5134
4628	5135	return R.release();
4629	5136	}
4630
4631
4632	5137	#endif
4633	5138
4634		if ( n >= 32 && n <= 256) {
	5139	if (n >= 32 && n <= 256) {
4635	5140	UniqueArray<long> q;
4636	5141	long i, j;
4637	5142	long levels, vec_len;

4815	5320	}
4816	5321
4817	5322
4818		#ifdef NTL_TBL_REM
	5323	#if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_REM))
4819	5324
4820	5325	_ntl_tmp_vec *_ntl_rem_struct_tbl::fetch()
4821	5326	{

4870	5375
4871	5376
4872	5377
4873		#ifdef NTL_TBL_REM
	5378
	5379	#if (defined(NTL_VIABLE_LL) && defined(NTL_TBL_REM))
4874	5380
4875	5381	static inline
4876	5382	mp_limb_t tbl_red_21(mp_limb_t hi, mp_limb_t lo, long d, mp_limb_t dinv)

4898	5404	// has exactly NTL_SP_NBITS bits. This will be the case for
4899	5405	// the FFT primes that are used.
4900	5406
		// Reduces the three-limb value (x2, x1, x0) modulo d with two chained
		// tbl_red_21 steps: first (x2, x1), then (that remainder, x0).
		// d and dinv are passed unchanged to tbl_red_21.
	5407	static inline
	5408	mp_limb_t tbl_red_31(mp_limb_t x2, mp_limb_t x1, mp_limb_t x0,
	5409	long d, mp_limb_t dinv)
	5410	{
	5411	mp_limb_t carry = tbl_red_21(x2, x1, d, dinv);
	5412	return tbl_red_21(carry, x0, d, dinv);
	5413	}
	5414
	5415	// NOTE: tbl_red_31 assumes x2 < d
	5416
4901	5417
4902	5418	#if (NTL_SP_NBITS == NTL_BITS_PER_LONG-2)
4903	5419

4921	5437	long i;
4922	5438	for (i = 0; i < n; i++) {
4923	5439	mp_limb_t *tp = tbl[i];
4924		NTL_ULL_TYPE acc = adata[0];
	5440	ll_type acc;
	5441	ll_init(acc, adata[0]);
4925	5442	long j;
4926	5443	for (j = 1; j < sa; j++)
4927		acc += ((NTL_ULL_TYPE) adata[j]) * ((NTL_ULL_TYPE) tp[j]);
	5444	ll_mul_add(acc, adata[j], tp[j]);
4928	5445
4929	5446	mp_limb_t accvec[2];
4930		accvec[0] = acc;
4931		accvec[1] = acc >> NTL_ZZ_NBITS;
4932		x[i] = tbl_red_n1(accvec, 2, primes[i], inv_primes[i]);
	5447	x[i] = tbl_red_31(0, ll_get_hi(acc), ll_get_lo(acc), primes[i], inv_primes[i]);
4933	5448	}
4934	5449	}
4935	5450	else {

4938	5453	mp_limb_t *ap = adata;
4939	5454	mp_limb_t *tp = tbl[i];
4940	5455
4941		NTL_ULL_TYPE acc21;
	5456	ll_type acc21;
4942	5457	mp_limb_t acc0;
4943	5458
4944	5459	{
4945		NTL_ULL_TYPE sum = ap[0];
4946		sum += ((NTL_ULL_TYPE) ap[1]) * ((NTL_ULL_TYPE) tp[1]);
4947		sum += ((NTL_ULL_TYPE) ap[2]) * ((NTL_ULL_TYPE) tp[2]);
4948		sum += ((NTL_ULL_TYPE) ap[3]) * ((NTL_ULL_TYPE) tp[3]);
4949
4950		acc21 = sum >> NTL_BITS_PER_LONG;
4951		acc0 = sum;
	5460	ll_type sum;
	5461	ll_init(sum, ap[0]);
	5462
	5463	ll_mul_add(sum, ap[1], tp[1]);
	5464	ll_mul_add(sum, ap[2], tp[2]);
	5465	ll_mul_add(sum, ap[3], tp[3]);
	5466
	5467	ll_init(acc21, ll_get_hi(sum));
	5468	acc0 = ll_get_lo(sum);
4952	5469	}
4953	5470
4954		long m;
4955		for (m = sa-4, ap += 4, tp += 4; m >= 4; m -= 4, ap += 4, tp += 4) {
4956		NTL_ULL_TYPE sum = ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
4957		sum += ((NTL_ULL_TYPE) ap[1]) * ((NTL_ULL_TYPE) tp[1]);
4958		sum += ((NTL_ULL_TYPE) ap[2]) * ((NTL_ULL_TYPE) tp[2]);
4959		sum += ((NTL_ULL_TYPE) ap[3]) * ((NTL_ULL_TYPE) tp[3]);
4960
4961		mp_limb_t sum1 = sum >> NTL_BITS_PER_LONG;
4962		mp_limb_t sum0 = sum;
4963		NTL_ULL_TYPE carry_acc0 = ((NTL_ULL_TYPE) acc0) + ((NTL_ULL_TYPE) sum0);
4964		mp_limb_t carry = carry_acc0 >> NTL_BITS_PER_LONG;
4965		acc0 = carry_acc0;
4966		NTL_ULL_TYPE x = ((NTL_ULL_TYPE) sum1) + ((NTL_ULL_TYPE) carry);
4967		acc21 += x;
	5471	long m=sa-4;
	5472	ap += 4;
	5473	tp += 4;
	5474
	5475	for (; m >= 8; m -= 8, ap += 8, tp += 8) {
	5476	{
	5477	ll_type sum;
	5478	ll_mul(sum, ap[0], tp[0]);
	5479	ll_mul_add(sum, ap[1], tp[1]);
	5480	ll_mul_add(sum, ap[2], tp[2]);
	5481	ll_mul_add(sum, ap[3], tp[3]);
	5482
	5483	ll_add(sum, acc0);
	5484	acc0 = ll_get_lo(sum);
	5485	ll_add(acc21, ll_get_hi(sum));
	5486	}
	5487	{
	5488
	5489	ll_type sum;
	5490	ll_mul(sum, ap[4+0], tp[4+0]);
	5491	ll_mul_add(sum, ap[4+1], tp[4+1]);
	5492	ll_mul_add(sum, ap[4+2], tp[4+2]);
	5493	ll_mul_add(sum, ap[4+3], tp[4+3]);
	5494
	5495	ll_add(sum, acc0);
	5496	acc0 = ll_get_lo(sum);
	5497	ll_add(acc21, ll_get_hi(sum));
	5498	}
4968	5499	}
4969	5500
	5501	for (; m >= 4; m -= 4, ap += 4, tp += 4) {
	5502	ll_type sum;
	5503	ll_mul(sum, ap[0], tp[0]);
	5504	ll_mul_add(sum, ap[1], tp[1]);
	5505	ll_mul_add(sum, ap[2], tp[2]);
	5506	ll_mul_add(sum, ap[3], tp[3]);
	5507
	5508	ll_add(sum, acc0);
	5509	acc0 = ll_get_lo(sum);
	5510	ll_add(acc21, ll_get_hi(sum));
	5511	}
	5512
4970	5513	if (m > 0) {
4971		NTL_ULL_TYPE sum = ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
	5514	ll_type sum;
	5515	ll_mul(sum, ap[0], tp[0]);
4972	5516	for (m--, ap++, tp++; m > 0; m--, ap++, tp++)
4973		sum += ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
4974
4975		mp_limb_t sum1 = sum >> NTL_BITS_PER_LONG;
4976		mp_limb_t sum0 = sum;
4977		NTL_ULL_TYPE carry_acc0 = ((NTL_ULL_TYPE) acc0) + ((NTL_ULL_TYPE) sum0);
4978		mp_limb_t carry = carry_acc0 >> NTL_BITS_PER_LONG;
4979		acc0 = carry_acc0;
4980		NTL_ULL_TYPE x = ((NTL_ULL_TYPE) sum1) + ((NTL_ULL_TYPE) carry);
4981		acc21 += x;
	5517	ll_mul_add(sum, ap[0], tp[0]);
	5518
	5519
	5520	ll_add(sum, acc0);
	5521	acc0 = ll_get_lo(sum);
	5522	ll_add(acc21, ll_get_hi(sum));
4982	5523	}
4983	5524
4984		mp_limb_t accvec[3];
4985		accvec[0] = acc0;
4986		accvec[1] = acc21;
4987		accvec[2] = acc21 >> NTL_BITS_PER_LONG;
4988		x[i] = tbl_red_n1(accvec, 3, primes[i], inv_primes[i]);
	5525	x[i] = tbl_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, primes[i], inv_primes[i]);
4989	5526	}
4990	5527	}
4991	5528	}
4992	5529
4993	5530	#else
4994	5531
4995		// General case: no loop unrolling
	5532	// General case: some loop unrolling (also using "Duff's Device")
	5533	// for the case where BPL-SPNBITS == 4: this is the common
	5534	// case on 64-bit machines. The loop unrolling and Duff seem
	5535	// to shave off 5-10%
	5536
	5537	#define TBL_UNROLL (1)
4996	5538
4997	5539	// DIRT: won't work if GMP has nails
4998	5540	void _ntl_rem_struct_tbl::eval(long *x, _ntl_gbigint a,

5013	5555	long i;
5014	5556	for (i = 0; i < n; i++) {
5015	5557	mp_limb_t *tp = tbl[i];
5016		NTL_ULL_TYPE acc = adata[0];
	5558
	5559
	5560	ll_type acc;
	5561	ll_init(acc, adata[0]);
	5562
	5563	#if (TBL_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
	5564	switch (sa) {
	5565	case 16: ll_mul_add(acc, adata[16-1], tp[16-1]);
	5566	case 15: ll_mul_add(acc, adata[15-1], tp[15-1]);
	5567	case 14: ll_mul_add(acc, adata[14-1], tp[14-1]);
	5568	case 13: ll_mul_add(acc, adata[13-1], tp[13-1]);
	5569	case 12: ll_mul_add(acc, adata[12-1], tp[12-1]);
	5570	case 11: ll_mul_add(acc, adata[11-1], tp[11-1]);
	5571	case 10: ll_mul_add(acc, adata[10-1], tp[10-1]);
	5572	case 9: ll_mul_add(acc, adata[9-1], tp[9-1]);
	5573	case 8: ll_mul_add(acc, adata[8-1], tp[8-1]);
	5574	case 7: ll_mul_add(acc, adata[7-1], tp[7-1]);
	5575	case 6: ll_mul_add(acc, adata[6-1], tp[6-1]);
	5576	case 5: ll_mul_add(acc, adata[5-1], tp[5-1]);
	5577	case 4: ll_mul_add(acc, adata[4-1], tp[4-1]);
	5578	case 3: ll_mul_add(acc, adata[3-1], tp[3-1]);
	5579	case 2: ll_mul_add(acc, adata[2-1], tp[2-1]);
	5580	}
	5581
	5582	#else
5017	5583	long j;
5018	5584	for (j = 1; j < sa; j++)
5019		acc += ((NTL_ULL_TYPE) adata[j]) * ((NTL_ULL_TYPE) tp[j]);
5020
5021		mp_limb_t accvec[2];
5022		accvec[0] = acc;
5023		accvec[1] = acc >> NTL_ZZ_NBITS;
5024		x[i] = tbl_red_n1(accvec, 2, primes[i], inv_primes[i]);
	5585	ll_mul_add(acc, adata[j], tp[j]);
	5586	#endif
	5587
	5588	x[i] = tbl_red_31(0, ll_get_hi(acc), ll_get_lo(acc), primes[i], inv_primes[i]);
5025	5589	}
5026	5590	}
5027	5591	else {

5030	5594	mp_limb_t *ap = adata;
5031	5595	mp_limb_t *tp = tbl[i];
5032	5596
5033		NTL_ULL_TYPE acc21;
	5597	ll_type acc21;
5034	5598	mp_limb_t acc0;
5035	5599
5036	5600	{
5037		NTL_ULL_TYPE sum = ap[0];
	5601	ll_type sum;
	5602	ll_init(sum, ap[0]);
	5603
	5604	#if (TBL_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
	5605	ll_mul_add(sum, ap[1], tp[1]);
	5606	ll_mul_add(sum, ap[2], tp[2]);
	5607	ll_mul_add(sum, ap[3], tp[3]);
	5608	ll_mul_add(sum, ap[4], tp[4]);
	5609	ll_mul_add(sum, ap[5], tp[5]);
	5610	ll_mul_add(sum, ap[6], tp[6]);
	5611	ll_mul_add(sum, ap[7], tp[7]);
	5612	ll_mul_add(sum, ap[8], tp[8]);
	5613	ll_mul_add(sum, ap[9], tp[9]);
	5614	ll_mul_add(sum, ap[10], tp[10]);
	5615	ll_mul_add(sum, ap[11], tp[11]);
	5616	ll_mul_add(sum, ap[12], tp[12]);
	5617	ll_mul_add(sum, ap[13], tp[13]);
	5618	ll_mul_add(sum, ap[14], tp[14]);
	5619	ll_mul_add(sum, ap[15], tp[15]);
	5620	#else
5038	5621	for (long j = 1; j < Bnd; j++)
5039		sum += ((NTL_ULL_TYPE) ap[j]) * ((NTL_ULL_TYPE) tp[j]);
5040
5041		acc21 = sum >> NTL_BITS_PER_LONG;
5042		acc0 = sum;
	5622	ll_mul_add(sum, ap[j], tp[j]);
	5623	#endif
	5624
	5625	ll_init(acc21, ll_get_hi(sum));
	5626	acc0 = ll_get_lo(sum);
5043	5627	}
5044	5628
5045	5629	long m;
5046	5630	for (m = sa-Bnd, ap += Bnd, tp += Bnd; m >= Bnd; m -= Bnd, ap += Bnd, tp += Bnd) {
5047		NTL_ULL_TYPE sum = ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
	5631
	5632	ll_type sum;
	5633	ll_mul(sum, ap[0], tp[0]);
	5634
	5635	#if (TBL_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
	5636	ll_mul_add(sum, ap[1], tp[1]);
	5637	ll_mul_add(sum, ap[2], tp[2]);
	5638	ll_mul_add(sum, ap[3], tp[3]);
	5639	ll_mul_add(sum, ap[4], tp[4]);
	5640	ll_mul_add(sum, ap[5], tp[5]);
	5641	ll_mul_add(sum, ap[6], tp[6]);
	5642	ll_mul_add(sum, ap[7], tp[7]);
	5643	ll_mul_add(sum, ap[8], tp[8]);
	5644	ll_mul_add(sum, ap[9], tp[9]);
	5645	ll_mul_add(sum, ap[10], tp[10]);
	5646	ll_mul_add(sum, ap[11], tp[11]);
	5647	ll_mul_add(sum, ap[12], tp[12]);
	5648	ll_mul_add(sum, ap[13], tp[13]);
	5649	ll_mul_add(sum, ap[14], tp[14]);
	5650	ll_mul_add(sum, ap[15], tp[15]);
	5651	#else
5048	5652	for (long j = 1; j < Bnd; j++)
5049		sum += ((NTL_ULL_TYPE) ap[j]) * ((NTL_ULL_TYPE) tp[j]);
5050
5051		mp_limb_t sum1 = sum >> NTL_BITS_PER_LONG;
5052		mp_limb_t sum0 = sum;
5053		NTL_ULL_TYPE carry_acc0 = ((NTL_ULL_TYPE) acc0) + ((NTL_ULL_TYPE) sum0);
5054		mp_limb_t carry = carry_acc0 >> NTL_BITS_PER_LONG;
5055		acc0 = carry_acc0;
5056		NTL_ULL_TYPE x = ((NTL_ULL_TYPE) sum1) + ((NTL_ULL_TYPE) carry);
5057		acc21 += x;
	5653	ll_mul_add(sum, ap[j], tp[j]);
	5654	#endif
	5655	ll_add(sum, acc0);
	5656	acc0 = ll_get_lo(sum);
	5657	ll_add(acc21, ll_get_hi(sum));
5058	5658	}
5059	5659
5060	5660	if (m > 0) {
5061		NTL_ULL_TYPE sum = ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
	5661	ll_type sum;
	5662	ll_mul(sum, ap[0], tp[0]);
	5663
	5664	#if (TBL_UNROLL && NTL_BITS_PER_LONG-NTL_SP_NBITS == 4)
	5665	switch (m) {
	5666	case 15: ll_mul_add(sum, ap[15-1], tp[15-1]);
	5667	case 14: ll_mul_add(sum, ap[14-1], tp[14-1]);
	5668	case 13: ll_mul_add(sum, ap[13-1], tp[13-1]);
	5669	case 12: ll_mul_add(sum, ap[12-1], tp[12-1]);
	5670	case 11: ll_mul_add(sum, ap[11-1], tp[11-1]);
	5671	case 10: ll_mul_add(sum, ap[10-1], tp[10-1]);
	5672	case 9: ll_mul_add(sum, ap[9-1], tp[9-1]);
	5673	case 8: ll_mul_add(sum, ap[8-1], tp[8-1]);
	5674	case 7: ll_mul_add(sum, ap[7-1], tp[7-1]);
	5675	case 6: ll_mul_add(sum, ap[6-1], tp[6-1]);
	5676	case 5: ll_mul_add(sum, ap[5-1], tp[5-1]);
	5677	case 4: ll_mul_add(sum, ap[4-1], tp[4-1]);
	5678	case 3: ll_mul_add(sum, ap[3-1], tp[3-1]);
	5679	case 2: ll_mul_add(sum, ap[2-1], tp[2-1]);
	5680	}
	5681	#else
5062	5682	for (m--, ap++, tp++; m > 0; m--, ap++, tp++)
5063		sum += ((NTL_ULL_TYPE) ap[0]) * ((NTL_ULL_TYPE) tp[0]);
5064
5065		mp_limb_t sum1 = sum >> NTL_BITS_PER_LONG;
5066		mp_limb_t sum0 = sum;
5067		NTL_ULL_TYPE carry_acc0 = ((NTL_ULL_TYPE) acc0) + ((NTL_ULL_TYPE) sum0);
5068		mp_limb_t carry = carry_acc0 >> NTL_BITS_PER_LONG;
5069		acc0 = carry_acc0;
5070		NTL_ULL_TYPE x = ((NTL_ULL_TYPE) sum1) + ((NTL_ULL_TYPE) carry);
5071		acc21 += x;
	5683	ll_mul_add(sum, ap[0], tp[0]);
	5684	#endif
	5685	ll_add(sum, acc0);
	5686	acc0 = ll_get_lo(sum);
	5687	ll_add(acc21, ll_get_hi(sum));
5072	5688	}
5073	5689
5074		mp_limb_t accvec[3];
5075		accvec[0] = acc0;
5076		accvec[1] = acc21;
5077		accvec[2] = acc21 >> NTL_BITS_PER_LONG;
5078		x[i] = tbl_red_n1(accvec, 3, primes[i], inv_primes[i]);
	5690	x[i] = tbl_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0,
	5691	primes[i], inv_primes[i]);
5079	5692	}
5080	5693	}
5081	5694	}

5486	6099	}
5487	6100
5488	6101
5489
5490
	6102	// general preconditioned remainder
	6103
	6104
	6105
	6106	#ifndef NTL_VIABLE_LL
	6107
	6108
		// Variant used when NTL_VIABLE_LL is not defined: no precomputed data.
	6109	class _ntl_general_rem_one_impl : public _ntl_general_rem_one_struct {
	6110	};
	6111
		// Variant used when NTL_VIABLE_LL is not defined: builds nothing and
		// always returns null; p and sz are ignored.
	6112	_ntl_general_rem_one_struct *
	6113	_ntl_general_rem_one_struct_build(long p, long sz)
	6114	{
	6115	return 0;
	6116	}
	6117
		// Variant used when NTL_VIABLE_LL is not defined: pinfo is ignored and
		// the result is whatever _ntl_gsmod(a, p) returns.
	6118	long
	6119	_ntl_general_rem_one_struct_apply(NTL_verylong a, long p, _ntl_general_rem_one_struct *pinfo)
	6120	{
	6121	return _ntl_gsmod(a, p);
	6122	}
	6123
	6124
	6125
	6126
	6127	#else
	6128
	6129	#define REM_ONE_THRESH (256)
	6130
		// Precomputed data for repeated remainders modulo one fixed p; it is
		// filled in by _ntl_general_rem_one_struct_build.
	6131	class _ntl_general_rem_one_impl : public _ntl_general_rem_one_struct {
	6132	public:
	6133	long sz; // number of table entries (capped at REM_ONE_THRESH by the builder)
	6134	sp_ll_reduce_struct red_struct; // result of make_sp_ll_reduce_struct(p)
	6135	long Bnd; // 1L << (NTL_BITS_PER_LONG - _ntl_g2logs(p)), set by the builder
	6136	UniqueArray<mp_limb_t> tbl; // tbl[0] = 1, tbl[j] = MulMod(tbl[j-1], t, p), t from the builder
	6137	};
	6138
	6139	_ntl_general_rem_one_struct *
	6140	_ntl_general_rem_one_struct_build(long p, long sz)
	6141	{
	6142	if (p < 2 \|\| p >= NTL_SP_BOUND)
	6143	LogicError("_ntl_general_rem_one_struct_build: bad args (p)");
	6144
	6145	if (sz < 0)
	6146	LogicError("_ntl_general_rem_one_struct_build: bad args (sz)");
	6147
	6148	if (sz > REM_ONE_THRESH) sz = REM_ONE_THRESH;
	6149
	6150	if (sz == 0) return 0;
	6151
	6152
	6153	UniquePtr<_ntl_general_rem_one_impl> ptr;
	6154	ptr.make();
	6155
	6156	ptr->sz = sz;
	6157
	6158	ptr->red_struct = make_sp_ll_reduce_struct(p);
	6159
	6160	ptr->Bnd = 1L << (NTL_BITS_PER_LONG-_ntl_g2logs(p));
	6161
	6162	ptr->tbl.SetLength(sz);
	6163
	6164	long t = 1;
	6165	for (long j = 0; j < NTL_ZZ_NBITS; j++) {
	6166	t += t;
	6167	if (t >= p) t -= p;
	6168	}
	6169
	6170	long t1 = 1;
	6171	ptr->tbl[0] = 1;
	6172	for (long j = 1; j < sz; j++) {
	6173	t1 = MulMod(t1, t, p);
	6174	ptr->tbl[j] = t1;
	6175	}
	6176
	6177	return ptr.release();
	6178	}
	6179
	6180
	6181
	6182
	6183	long
	6184	_ntl_general_rem_one_struct_apply(NTL_verylong a, long p, _ntl_general_rem_one_struct *pinfo)
	6185	{
	6186	if (ZEROP(a)) return 0;
	6187
	6188	if (!pinfo) {
	6189	return _ntl_gsmod(a, p);
	6190	}
	6191
	6192	_ntl_general_rem_one_impl ptr = (_ntl_general_rem_one_impl ) pinfo;
	6193
	6194
	6195	long sz = ptr->sz;
	6196	sp_ll_reduce_struct red_struct = ptr->red_struct;
	6197	long Bnd = ptr->Bnd;
	6198	mp_limb_t *tbl = ptr->tbl.elts();
	6199
	6200	long a_sz, a_neg;
	6201	mp_limb_t *a_data;
	6202	GET_SIZE_NEG(a_sz, a_neg, a);
	6203	a_data = DATA(a);
	6204
	6205	if (a_sz > sz) {
	6206	long res = mpn_mod_1(a_data, a_sz, p);
	6207	if (a_neg) res = NegateMod(res, p);
	6208	return res;
	6209	}
	6210	else if (a_sz <= Bnd) {
	6211	ll_type acc;
	6212	ll_init(acc, a_data[0]);
	6213
	6214	{
	6215	long j = 1;
	6216
	6217	for (; j <= a_sz-16; j += 16) {
	6218	ll_mul_add(acc, a_data[j+0], tbl[j+0]);
	6219	ll_mul_add(acc, a_data[j+1], tbl[j+1]);
	6220	ll_mul_add(acc, a_data[j+2], tbl[j+2]);
	6221	ll_mul_add(acc, a_data[j+3], tbl[j+3]);
	6222	ll_mul_add(acc, a_data[j+4], tbl[j+4]);
	6223	ll_mul_add(acc, a_data[j+5], tbl[j+5]);
	6224	ll_mul_add(acc, a_data[j+6], tbl[j+6]);
	6225	ll_mul_add(acc, a_data[j+7], tbl[j+7]);
	6226	ll_mul_add(acc, a_data[j+8], tbl[j+8]);
	6227	ll_mul_add(acc, a_data[j+9], tbl[j+9]);
	6228	ll_mul_add(acc, a_data[j+10], tbl[j+10]);
	6229	ll_mul_add(acc, a_data[j+11], tbl[j+11]);
	6230	ll_mul_add(acc, a_data[j+12], tbl[j+12]);
	6231	ll_mul_add(acc, a_data[j+13], tbl[j+13]);
	6232	ll_mul_add(acc, a_data[j+14], tbl[j+14]);
	6233	ll_mul_add(acc, a_data[j+15], tbl[j+15]);
	6234	}
	6235
	6236	for (; j <= a_sz-4; j += 4) {
	6237	ll_mul_add(acc, a_data[j+0], tbl[j+0]);
	6238	ll_mul_add(acc, a_data[j+1], tbl[j+1]);
	6239	ll_mul_add(acc, a_data[j+2], tbl[j+2]);
	6240	ll_mul_add(acc, a_data[j+3], tbl[j+3]);
	6241	}
	6242
	6243	for (; j < a_sz; j++)
	6244	ll_mul_add(acc, a_data[j+0], tbl[j+0]);
	6245	}
	6246
	6247
	6248	long res = sp_ll_red_31(0, ll_get_hi(acc), ll_get_lo(acc), p, red_struct);
	6249	if (a_neg) res = NegateMod(res, p);
	6250	return res;
	6251	}
	6252	else if (Bnd > 16) {
	6253	ll_type acc21;
	6254	ll_init(acc21, 0);
	6255	mp_limb_t acc0 = 0;
	6256
	6257	long jj = 0;
	6258	for (; jj <= a_sz-Bnd; jj += Bnd) {
	6259	ll_type acc;
	6260	ll_init(acc, acc0);
	6261
	6262	long j = jj;
	6263
	6264	for (; j <= jj+Bnd-16; j += 16) {
	6265	ll_mul_add(acc, a_data[j+0], tbl[j+0]);
	6266	ll_mul_add(acc, a_data[j+1], tbl[j+1]);
	6267	ll_mul_add(acc, a_data[j+2], tbl[j+2]);
	6268	ll_mul_add(acc, a_data[j+3], tbl[j+3]);
	6269	ll_mul_add(acc, a_data[j+4], tbl[j+4]);
	6270	ll_mul_add(acc, a_data[j+5], tbl[j+5]);
	6271	ll_mul_add(acc, a_data[j+6], tbl[j+6]);
	6272	ll_mul_add(acc, a_data[j+7], tbl[j+7]);
	6273	ll_mul_add(acc, a_data[j+8], tbl[j+8]);
	6274	ll_mul_add(acc, a_data[j+9], tbl[j+9]);
	6275	ll_mul_add(acc, a_data[j+10], tbl[j+10]);
	6276	ll_mul_add(acc, a_data[j+11], tbl[j+11]);
	6277	ll_mul_add(acc, a_data[j+12], tbl[j+12]);
	6278	ll_mul_add(acc, a_data[j+13], tbl[j+13]);
	6279	ll_mul_add(acc, a_data[j+14], tbl[j+14]);
	6280	ll_mul_add(acc, a_data[j+15], tbl[j+15]);
	6281	}
	6282
	6283	acc0 = ll_get_lo(acc);
	6284	ll_add(acc21, ll_get_hi(acc));
	6285	}
	6286
	6287	if (jj < a_sz) {
	6288	ll_type acc;
	6289	ll_init(acc, acc0);
	6290
	6291	long j = jj;
	6292
	6293	for (; j <= a_sz-4; j += 4) {
	6294	ll_mul_add(acc, a_data[j+0], tbl[j+0]);
	6295	ll_mul_add(acc, a_data[j+1], tbl[j+1]);
	6296	ll_mul_add(acc, a_data[j+2], tbl[j+2]);
	6297	ll_mul_add(acc, a_data[j+3], tbl[j+3]);
	6298	}
	6299
	6300	for (; j < a_sz; j++)
	6301	ll_mul_add(acc, a_data[j+0], tbl[j+0]);
	6302
	6303	acc0 = ll_get_lo(acc);
	6304	ll_add(acc21, ll_get_hi(acc));
	6305	}
	6306
	6307	long res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, red_struct);
	6308	if (a_neg) res = NegateMod(res, p);
	6309	return res;
	6310	}
	6311	else if (Bnd == 16) {
	6312	ll_type acc21;
	6313	ll_init(acc21, 0);
	6314	mp_limb_t acc0 = 0;
	6315
	6316	long jj = 0;
	6317	for (; jj <= a_sz-16; jj += 16) {
	6318	ll_type acc;
	6319
	6320	long j = jj;
	6321
	6322	ll_mul(acc, a_data[j+0], tbl[j+0]);
	6323	ll_mul_add(acc, a_data[j+1], tbl[j+1]);
	6324	ll_mul_add(acc, a_data[j+2], tbl[j+2]);
	6325	ll_mul_add(acc, a_data[j+3], tbl[j+3]);
	6326	ll_mul_add(acc, a_data[j+4], tbl[j+4]);
	6327	ll_mul_add(acc, a_data[j+5], tbl[j+5]);
	6328	ll_mul_add(acc, a_data[j+6], tbl[j+6]);
	6329	ll_mul_add(acc, a_data[j+7], tbl[j+7]);
	6330	ll_mul_add(acc, a_data[j+8], tbl[j+8]);
	6331	ll_mul_add(acc, a_data[j+9], tbl[j+9]);
	6332	ll_mul_add(acc, a_data[j+10], tbl[j+10]);
	6333	ll_mul_add(acc, a_data[j+11], tbl[j+11]);
	6334	ll_mul_add(acc, a_data[j+12], tbl[j+12]);
	6335	ll_mul_add(acc, a_data[j+13], tbl[j+13]);
	6336	ll_mul_add(acc, a_data[j+14], tbl[j+14]);
	6337	ll_mul_add(acc, a_data[j+15], tbl[j+15]);
	6338
	6339	ll_add(acc, acc0);
	6340	acc0 = ll_get_lo(acc);
	6341	ll_add(acc21, ll_get_hi(acc));
	6342	}
	6343
	6344	if (jj < a_sz) {
	6345	ll_type acc;
	6346	ll_init(acc, acc0);
	6347
	6348	long j = jj;
	6349
	6350	for (; j <= a_sz-4; j += 4) {
	6351	ll_mul_add(acc, a_data[j+0], tbl[j+0]);
	6352	ll_mul_add(acc, a_data[j+1], tbl[j+1]);
	6353	ll_mul_add(acc, a_data[j+2], tbl[j+2]);
	6354	ll_mul_add(acc, a_data[j+3], tbl[j+3]);
	6355	}
	6356
	6357	for (; j < a_sz; j++)
	6358	ll_mul_add(acc, a_data[j+0], tbl[j+0]);
	6359
	6360	acc0 = ll_get_lo(acc);
	6361	ll_add(acc21, ll_get_hi(acc));
	6362	}
	6363
	6364	long res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, red_struct);
	6365	if (a_neg) res = NegateMod(res, p);
	6366	return res;
	6367	}
	6368	else if (Bnd == 8) {
	6369	ll_type acc21;
	6370	ll_init(acc21, 0);
	6371	mp_limb_t acc0 = 0;
	6372
	6373	long jj = 0;
	6374	for (; jj <= a_sz-8; jj += 8) {
	6375	ll_type acc;
	6376
	6377	long j = jj;
	6378
	6379	ll_mul(acc, a_data[j+0], tbl[j+0]);
	6380	ll_mul_add(acc, a_data[j+1], tbl[j+1]);
	6381	ll_mul_add(acc, a_data[j+2], tbl[j+2]);
	6382	ll_mul_add(acc, a_data[j+3], tbl[j+3]);
	6383	ll_mul_add(acc, a_data[j+4], tbl[j+4]);
	6384	ll_mul_add(acc, a_data[j+5], tbl[j+5]);
	6385	ll_mul_add(acc, a_data[j+6], tbl[j+6]);
	6386	ll_mul_add(acc, a_data[j+7], tbl[j+7]);
	6387
	6388	ll_add(acc, acc0);
	6389	acc0 = ll_get_lo(acc);
	6390	ll_add(acc21, ll_get_hi(acc));
	6391	}
	6392
	6393	if (jj < a_sz) {
	6394	ll_type acc;
	6395	ll_init(acc, acc0);
	6396
	6397	long j = jj;
	6398
	6399	for (; j < a_sz; j++)
	6400	ll_mul_add(acc, a_data[j+0], tbl[j+0]);
	6401
	6402	acc0 = ll_get_lo(acc);
	6403	ll_add(acc21, ll_get_hi(acc));
	6404	}
	6405
	6406	long res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, red_struct);
	6407	if (a_neg) res = NegateMod(res, p);
	6408	return res;
	6409	}
	6410	else /* Bnd == 4 */ {
	6411	ll_type acc21;
	6412	ll_init(acc21, 0);
	6413	mp_limb_t acc0 = 0;
	6414
	6415	long jj = 0;
	6416	for (; jj <= a_sz-4; jj += 4) {
	6417	ll_type acc;
	6418
	6419	long j = jj;
	6420
	6421	ll_mul(acc, a_data[j+0], tbl[j+0]);
	6422	ll_mul_add(acc, a_data[j+1], tbl[j+1]);
	6423	ll_mul_add(acc, a_data[j+2], tbl[j+2]);
	6424	ll_mul_add(acc, a_data[j+3], tbl[j+3]);
	6425
	6426
	6427	ll_add(acc, acc0);
	6428	acc0 = ll_get_lo(acc);
	6429	ll_add(acc21, ll_get_hi(acc));
	6430	}
	6431
	6432	if (jj < a_sz) {
	6433	ll_type acc;
	6434	ll_init(acc, acc0);
	6435
	6436	long j = jj;
	6437
	6438	for (; j < a_sz; j++)
	6439	ll_mul_add(acc, a_data[j+0], tbl[j+0]);
	6440
	6441
	6442	acc0 = ll_get_lo(acc);
	6443	ll_add(acc21, ll_get_hi(acc));
	6444	}
	6445
	6446	long res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, red_struct);
	6447	if (a_neg) res = NegateMod(res, p);
	6448	return res;
	6449	}
	6450	}
	6451
	6452
	6453
	6454
	6455
	6456
	6457
	6458
	6459
	6460	#endif
	6461
	6462
	6463

+186

-7

src/lzz_p.c less more

3	3	#include <NTL/new.h>
4	4
5	5	NTL_START_IMPL
	6
	7
	8	NTL_TLS_GLOBAL_DECL(SmartPtr<zz_pInfoT>, zz_pInfo_stg)
	9
	10	NTL_CHEAP_THREAD_LOCAL zz_pInfoT *zz_pInfo = 0;
	11
	12
6	13
7	14	SmartPtr<zz_pInfoT> Build_zz_pInfo(FFTPrimeInfo *info)
8	15	{

25	32	p = NewP;
26	33	pinv = PrepMulMod(p);
27	34	red_struct = sp_PrepRem(p);
	35	ll_red_struct = make_sp_ll_reduce_struct(p);
28	36
29	37	p_info = 0;
30	38

81	89	p = info->q;
82	90	pinv = info->qinv;
83	91	red_struct = sp_PrepRem(p);
	92	ll_red_struct = make_sp_ll_reduce_struct(p);
84	93
85	94
86	95	p_info = info;

101	110	p = q;
102	111	pinv = PrepMulMod(p);
103	112	red_struct = sp_PrepRem(p);
	113	ll_red_struct = make_sp_ll_reduce_struct(p);
104	114
105	115
106	116	p_info_owner.make();

119	129	}
120	130
121	131
122
123		NTL_THREAD_LOCAL SmartPtr<zz_pInfoT> zz_pInfo = 0;
124
125
126
127	132	void zz_p::init(long p, long maxroot)
128	133	{
129	134	zz_pContext c(p, maxroot);

164	169
165	170	void zz_pContext::save()
166	171	{
167		ptr = zz_pInfo;
	172	NTL_TLS_GLOBAL_ACCESS(zz_pInfo_stg);
	173	ptr = zz_pInfo_stg;
168	174	}
169	175
170	176	void zz_pContext::restore() const
171	177	{
172		zz_pInfo = ptr;
	178	NTL_TLS_GLOBAL_ACCESS(zz_pInfo_stg);
	179	zz_pInfo_stg = ptr;
	180	zz_pInfo = zz_pInfo_stg.get();
173	181	}
174	182
175	183

225	233	return s;
226	234	}
227	235
	236
	237
	238	// ***********************************************************************
	239
	240
	241	#ifdef NTL_HAVE_LL_TYPE
	242
	243
	244	// NOTE: the following code sequence will generate imulq
	245	// instructions on x86_64 machines, which empirically is faster
	246	// than using the mulq instruction or even the mulxq instruction,
	247	// (tested on a Haswell machine).
	248
	249	long
	250	InnerProd_LL(const long *ap, const zz_p *bp, long n, long d,
	251	sp_ll_reduce_struct dinv)
	252	{
	253	const long BLKSIZE = (1L << min(20, 2*(NTL_BITS_PER_LONG-NTL_SP_NBITS)));
	254
	255	unsigned long acc0 = 0;
	256	ll_type acc21;
	257	ll_init(acc21, 0);
	258
	259	long i;
	260	for (i = 0; i <= n-BLKSIZE; i += BLKSIZE, ap += BLKSIZE, bp += BLKSIZE) {
	261	// sum ap[j]*rep(bp[j]) for j in [0..BLKSIZE)
	262
	263	ll_type sum;
	264	ll_init(sum, 0);
	265	for (long j = 0; j < BLKSIZE; j += 4) {
	266	ll_imul_add(sum, ap[j+0], rep(bp[j+0]));
	267	ll_imul_add(sum, ap[j+1], rep(bp[j+1]));
	268	ll_imul_add(sum, ap[j+2], rep(bp[j+2]));
	269	ll_imul_add(sum, ap[j+3], rep(bp[j+3]));
	270	}
	271
	272	ll_add(sum, acc0);
	273	acc0 = ll_get_lo(sum);
	274	ll_add(acc21, ll_get_hi(sum));
	275	}
	276
	277	if (i < n) {
	278	// sum ap[i]*rep(bp[j]) for j in [0..n-i)
	279
	280	ll_type sum;
	281	ll_init(sum, 0);
	282	long j = 0;
	283	for (; j <= n-i-4; j += 4) {
	284	ll_imul_add(sum, ap[j+0], rep(bp[j+0]));
	285	ll_imul_add(sum, ap[j+1], rep(bp[j+1]));
	286	ll_imul_add(sum, ap[j+2], rep(bp[j+2]));
	287	ll_imul_add(sum, ap[j+3], rep(bp[j+3]));
	288	}
	289
	290	for (; j < n-i; j++)
	291	ll_imul_add(sum, ap[j], rep(bp[j]));
	292
	293
	294	ll_add(sum, acc0);
	295	acc0 = ll_get_lo(sum);
	296	ll_add(acc21, ll_get_hi(sum));
	297	}
	298
	299	if (dinv.nbits == NTL_SP_NBITS)
	300	return sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, d, dinv);
	301	else
	302	return sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, d, dinv);
	303	}
	304
	305
	306	long
	307	InnerProd_LL(const zz_p *ap, const zz_p *bp, long n, long d,
	308	sp_ll_reduce_struct dinv)
	309	{
	310	const long BLKSIZE = (1L << min(20, 2*(NTL_BITS_PER_LONG-NTL_SP_NBITS)));
	311
	312	unsigned long acc0 = 0;
	313	ll_type acc21;
	314	ll_init(acc21, 0);
	315
	316	long i;
	317	for (i = 0; i <= n-BLKSIZE; i += BLKSIZE, ap += BLKSIZE, bp += BLKSIZE) {
	318	// sum ap[j]*rep(bp[j]) for j in [0..BLKSIZE)
	319
	320	ll_type sum;
	321	ll_init(sum, 0);
	322	for (long j = 0; j < BLKSIZE; j += 4) {
	323	ll_imul_add(sum, rep(ap[j+0]), rep(bp[j+0]));
	324	ll_imul_add(sum, rep(ap[j+1]), rep(bp[j+1]));
	325	ll_imul_add(sum, rep(ap[j+2]), rep(bp[j+2]));
	326	ll_imul_add(sum, rep(ap[j+3]), rep(bp[j+3]));
	327	}
	328
	329	ll_add(sum, acc0);
	330	acc0 = ll_get_lo(sum);
	331	ll_add(acc21, ll_get_hi(sum));
	332	}
	333
	334	if (i < n) {
	335	// sum ap[i]*rep(bp[j]) for j in [0..n-i)
	336
	337	ll_type sum;
	338	ll_init(sum, 0);
	339	long j = 0;
	340	for (; j <= n-i-4; j += 4) {
	341	ll_imul_add(sum, rep(ap[j+0]), rep(bp[j+0]));
	342	ll_imul_add(sum, rep(ap[j+1]), rep(bp[j+1]));
	343	ll_imul_add(sum, rep(ap[j+2]), rep(bp[j+2]));
	344	ll_imul_add(sum, rep(ap[j+3]), rep(bp[j+3]));
	345	}
	346
	347	for (; j < n-i; j++)
	348	ll_imul_add(sum, rep(ap[j]), rep(bp[j]));
	349
	350
	351	ll_add(sum, acc0);
	352	acc0 = ll_get_lo(sum);
	353	ll_add(acc21, ll_get_hi(sum));
	354	}
	355
	356	if (dinv.nbits == NTL_SP_NBITS)
	357	return sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, d, dinv);
	358	else
	359	return sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, d, dinv);
	360	}
	361
	362
	363	long
	364	InnerProd_L(const long *ap, const zz_p *bp, long n, long d,
	365	sp_reduce_struct dinv)
	366	{
	367	unsigned long sum = 0;
	368	long j = 0;
	369
	370	for (; j <= n-4; j += 4) {
	371	sum += (ap[j+0]) * (rep(bp[j+0]));
	372	sum += (ap[j+1]) * (rep(bp[j+1]));
	373	sum += (ap[j+2]) * (rep(bp[j+2]));
	374	sum += (ap[j+3]) * (rep(bp[j+3]));
	375	}
	376
	377	for (; j < n; j++)
	378	sum += (ap[j]) * (rep(bp[j]));
	379
	380	return rem(sum, d, dinv);
	381	}
	382
	383	long
	384	InnerProd_L(const zz_p *ap, const zz_p *bp, long n, long d,
	385	sp_reduce_struct dinv)
	386	{
	387	unsigned long sum = 0;
	388	long j = 0;
	389
	390	for (; j <= n-4; j += 4) {
	391	sum += (rep(ap[j+0])) * (rep(bp[j+0]));
	392	sum += (rep(ap[j+1])) * (rep(bp[j+1]));
	393	sum += (rep(ap[j+2])) * (rep(bp[j+2]));
	394	sum += (rep(ap[j+3])) * (rep(bp[j+3]));
	395	}
	396
	397	for (; j < n; j++)
	398	sum += (rep(ap[j])) * (rep(bp[j]));
	399
	400	return rem(sum, d, dinv);
	401	}
	402
	403	#endif
	404
	405
	406
228	407	NTL_END_IMPL

+12

-6

src/lzz_pE.c less more

4	4	#include <NTL/new.h>
5	5
6	6	NTL_START_IMPL
	7
	8
	9	NTL_TLS_GLOBAL_DECL(SmartPtr<zz_pEInfoT>, zz_pEInfo_stg)
	10
	11	NTL_CHEAP_THREAD_LOCAL zz_pEInfoT *zz_pEInfo = 0;
	12
7	13
8	14	zz_pEInfoT::zz_pEInfoT(const zz_pX& NewP)
9	15	{

34	40
35	41
36	42
37		NTL_THREAD_LOCAL SmartPtr<zz_pEInfoT> zz_pEInfo = 0;
38
39
40	43	void zz_pE::init(const zz_pX& p)
41	44	{
42	45	zz_pEContext c(p);

46	49
47	50	void zz_pEContext::save()
48	51	{
49		ptr = zz_pEInfo;
	52	NTL_TLS_GLOBAL_ACCESS(zz_pEInfo_stg);
	53	ptr = zz_pEInfo_stg;
50	54	}
51	55
52	56	void zz_pEContext::restore() const
53	57	{
54		zz_pEInfo = ptr;
	58	NTL_TLS_GLOBAL_ACCESS(zz_pEInfo_stg);
	59	zz_pEInfo_stg = ptr;
	60	zz_pEInfo = zz_pEInfo_stg.get();
55	61	}
56	62
57	63

77	83
78	84	const zz_pE& zz_pE::zero()
79	85	{
80		NTL_THREAD_LOCAL static zz_pE z(INIT_NO_ALLOC);
	86	static const zz_pE z(INIT_NO_ALLOC); // GLOBAL (assumes C++11 thread-safe init)
81	87	return z;
82	88	}
83	89

-2

src/lzz_pEX.c less more

12	12
13	13	const zz_pEX& zz_pEX::zero()
14	14	{
15		NTL_THREAD_LOCAL static zz_pEX z;
	15	static const zz_pEX z; // GLOBAL (assumes C++11 thread-safe init)
16	16	return z;
17	17	}
18	18

2214	2214	MulMod(A.H[i], A.H[i-1], h, F);
2215	2215	}
2216	2216
2217		NTL_THREAD_LOCAL long zz_pEXArgBound = 0;
	2217	NTL_CHEAP_THREAD_LOCAL long zz_pEXArgBound = 0;
2218	2218
2219	2219
2220	2220

-6

src/lzz_pEXFactoring.c less more

356	356	}
357	357
358	358
359		NTL_THREAD_LOCAL long zz_pEX_BlockingFactor = 10;
	359	NTL_CHEAP_THREAD_LOCAL long zz_pEX_BlockingFactor = 10;
360	360
361	361
362	362

1060	1060
1061	1061	/*********** NEW DDF **************/
1062	1062
1063		NTL_THREAD_LOCAL long zz_pEX_GCDTableSize = 4;
1064		NTL_THREAD_LOCAL double zz_pEXFileThresh = NTL_FILE_THRESH;
1065		NTL_THREAD_LOCAL static vec_zz_pEX *BabyStepFile=0;
1066		NTL_THREAD_LOCAL static vec_zz_pEX *GiantStepFile=0;
1067		NTL_THREAD_LOCAL static long use_files;
	1063	NTL_CHEAP_THREAD_LOCAL long zz_pEX_GCDTableSize = 4;
	1064	NTL_CHEAP_THREAD_LOCAL double zz_pEXFileThresh = NTL_FILE_THRESH;
	1065	static NTL_CHEAP_THREAD_LOCAL vec_zz_pEX *BabyStepFile=0;
	1066	static NTL_CHEAP_THREAD_LOCAL vec_zz_pEX *GiantStepFile=0;
	1067	static NTL_CHEAP_THREAD_LOCAL long use_files;
1068	1068
1069	1069
1070	1070	static

+207

-58

src/lzz_pX.c less more

21	21
22	22	const zz_pX& zz_pX::zero()
23	23	{
24		NTL_THREAD_LOCAL static zz_pX z;
	24	static const zz_pX z; // GLOBAL (assumes C++11 thread-safe init)
25	25	return z;
26	26	}
27	27

1465	1465	}
1466	1466
1467	1467
	1468	#if 0
	1469	// converts entries lo..lo+cnt-1 in R and stores results into res
	1470	static
	1471	void FromModularRep(zz_p* res, const fftRep& R, long lo, long cnt,
	1472	zz_pInfoT* info)
	1473	{
	1474	if (cnt <= 0) return;
	1475
	1476	long nprimes = info->NumPrimes;
	1477	long p = info->p;
	1478	mulmod_t pinv = info->pinv;
	1479	long *CoeffModP = info->CoeffModP.elts();
	1480	double *x = info->x.elts();
	1481	long *u = info->u.elts();
	1482	mulmod_precon_t *uqinv = info->uqinv.elts();
	1483	long MinusMModP = info->MinusMModP;
	1484	mulmod_precon_t MinusMModPpinv = info->MinusMModPpinv;
	1485	mulmod_precon_t *CoeffModPpinv = info->CoeffModPpinv.elts();
	1486
	1487	long primes[4];
	1488	double prime_recip[4];
	1489	long *tbl[4];
	1490
	1491	long q, s, t;
	1492	long i, j;
	1493	double y;
	1494
	1495	for (i = 0; i < nprimes; i++) {
	1496	primes[i] = GetFFTPrime(i);
	1497	prime_recip[i] = GetFFTPrimeRecip(i);
	1498	tbl[i] = R.tbl[i].get();
	1499	}
	1500
	1501	for (j = 0; j < cnt; j++) {
	1502	y = double(0L);
	1503	t = 0;
	1504
	1505	for (i = 0; i < nprimes; i++) {
	1506	s = MulModPrecon(tbl[i][j+lo], u[i], primes[i], uqinv[i]);
	1507	y = y + double(s)*prime_recip[i];
	1508
	1509
	1510	// DIRT: uses undocumented MulMod feature (see sp_arith.h)
	1511	// input s is not reduced mod p
	1512	s = MulModPrecon(s, CoeffModP[i], p, CoeffModPpinv[i]);
	1513
	1514	t = AddMod(t, s, p);
	1515	}
	1516
	1517	q = (long) (y + 0.5);
	1518
	1519	// DIRT: uses undocumented MulMod feature (see sp_arith.h)
	1520	// input q may not be reduced mod p
	1521	s = MulModPrecon(q, MinusMModP, p, MinusMModPpinv);
	1522
	1523	t = AddMod(t, s, p);
	1524	res[j].LoopHole() = t;
	1525	}
	1526
	1527	}
	1528	#else
	1529
	1530	#define NTL_FMR_LOOP_BODY(i) \
	1531	s = MulModPrecon(tbl[i][j+lo], u[i], primes[i], uqinv[i]);\
	1532	y = y + double(s)*prime_recip[i];\
	1533	\
	1534	\
	1535	/* DIRT: uses undocumented MulMod feature (see sp_arith.h) */\
	1536	/* input s is not reduced mod p */\
	1537	s = MulModPrecon(s, CoeffModP[i], p, CoeffModPpinv[i]);\
	1538	\
	1539	t = AddMod(t, s, p);\
	1540
	1541
	1542	#define NTL_FMP_OUTER_LOOP(XXX) \
	1543	for (j = 0; j < cnt; j++) {\
	1544	y = double(0L);\
	1545	t = 0;\
	1546	XXX \
	1547	q = (long) (y + 0.5);\
	1548	/* DIRT: uses undocumented MulMod feature (see sp_arith.h) */\
	1549	/* input q may not be reduced mod p */\
	1550	s = MulModPrecon(q, MinusMModP, p, MinusMModPpinv);\
	1551	t = AddMod(t, s, p);\
	1552	res[j].LoopHole() = t;\
	1553	}\
	1554
	1555
	1556
	1557	// converts entries lo..lo+cnt-1 in R and stores results into res
	1558	static
	1559	void FromModularRep(zz_p* res, const fftRep& R, long lo, long cnt,
	1560	zz_pInfoT* info)
	1561	{
	1562	if (cnt <= 0) return;
	1563
	1564	long nprimes = info->NumPrimes;
	1565	long p = info->p;
	1566	mulmod_t pinv = info->pinv;
	1567	long *CoeffModP = info->CoeffModP.elts();
	1568	double *x = info->x.elts();
	1569	long *u = info->u.elts();
	1570	mulmod_precon_t *uqinv = info->uqinv.elts();
	1571	long MinusMModP = info->MinusMModP;
	1572	mulmod_precon_t MinusMModPpinv = info->MinusMModPpinv;
	1573	mulmod_precon_t *CoeffModPpinv = info->CoeffModPpinv.elts();
	1574
	1575	long primes[4];
	1576	double prime_recip[4];
	1577	long *tbl[4];
	1578
	1579	long q, s, t;
	1580	long i, j;
	1581	double y;
	1582
	1583	for (i = 0; i < nprimes; i++) {
	1584	primes[i] = GetFFTPrime(i);
	1585	prime_recip[i] = GetFFTPrimeRecip(i);
	1586	tbl[i] = R.tbl[i].get();
	1587	}
	1588
	1589	if (nprimes == 1) {
	1590	long *tbl_0 = tbl[0];
	1591	mulmod_precon_t CoeffModPpinv_0 = CoeffModPpinv[0];
	1592	long primes_0 = primes[0];
	1593	long hp0 = primes_0 >> 1;
	1594
	1595	for (j = 0; j < cnt; j++) {
	1596	s = tbl_0[j+lo];
	1597
	1598	// DIRT: uses undocumented MulMod feature (see sp_arith.h)
	1599	// input s is not reduced mod p
	1600	t = MulModPrecon(s, 1, p, CoeffModPpinv_0);
	1601
	1602	res[j].LoopHole() = AddMod(t, sp_SignMask(hp0-s) & MinusMModP, p);
	1603	}
	1604	}
	1605	else if (nprimes == 2) {
	1606	NTL_FMP_OUTER_LOOP( NTL_FMR_LOOP_BODY(0) NTL_FMR_LOOP_BODY(1) )
	1607	}
	1608	else if (nprimes == 3) {
	1609	NTL_FMP_OUTER_LOOP( NTL_FMR_LOOP_BODY(0) NTL_FMR_LOOP_BODY(1) NTL_FMR_LOOP_BODY(2) )
	1610	}
	1611	else { // nprimes == 4
	1612	NTL_FMP_OUTER_LOOP( NTL_FMR_LOOP_BODY(0) NTL_FMR_LOOP_BODY(1) NTL_FMR_LOOP_BODY(2) NTL_FMR_LOOP_BODY(3) )
	1613	}
	1614	}
	1615
	1616
	1617
	1618
	1619	#endif
	1620
	1621
	1622
1468	1623
1469	1624	void TofftRep(fftRep& y, const zz_pX& x, long k, long lo, long hi)
1470	1625	// computes an n = 2^k point convolution.
1471	1626	// if deg(x) >= 2^k, then x is first reduced modulo X^n-1.
1472	1627	{
1473		zz_pInfoT *info = zz_pInfo.get();
	1628	zz_pInfoT *info = zz_pInfo;
1474	1629	long p = info->p;
1475	1630
1476	1631	long n, i, j, m, j1;
1477	1632	long accum;
1478		long NumPrimes = info->NumPrimes;
	1633	long nprimes = info->NumPrimes;
1479	1634
1480	1635
1481	1636	if (k > info->MaxRoot)

1497	1652	FFTPrimeInfo *p_info = info->p_info;
1498	1653
1499	1654	if (p_info) {
1500		for (j = 0; j < n; j++) {
1501		if (j >= m) {
1502		y.tbl[0][j] = 0;
	1655	if (n >= m) {
	1656	long *yp = &y.tbl[0][0];
	1657	for (j = 0; j < m; j++) {
	1658	yp[j] = rep(xx[j+lo]);
1503	1659	}
1504		else {
	1660	for (j = m; j < n; j++) {
	1661	yp[j] = 0;
	1662	}
	1663	}
	1664	else {
	1665	for (j = 0; j < n; j++) {
1505	1666	accum = rep(xx[j+lo]);
1506	1667	for (j1 = j + n; j1 < m; j1 += n)
1507	1668	accum = AddMod(accum, rep(xx[j1+lo]), p);

1510	1671	}
1511	1672	}
1512	1673	else {
1513		for (j = 0; j < n; j++) {
1514		if (j >= m) {
1515		for (i = 0; i < NumPrimes; i++)
1516		y.tbl[i][j] = 0;
	1674	if (n >= m) {
	1675	for (i = 0; i < nprimes; i++) {
	1676	long q = GetFFTPrime(i);
	1677	long *yp = &y.tbl[i][0];
	1678	for (j = 0; j < m; j++) {
	1679	long t = rep(xx[j+lo]);
	1680	t = sp_CorrectExcess(t, q);
	1681	yp[j] = t;
	1682	}
	1683	for (j = m; j < n; j++) {
	1684	yp[j] = 0;
	1685	}
1517	1686	}
1518		else {
	1687	}
	1688	else {
	1689	for (j = 0; j < n; j++) {
1519	1690	accum = rep(xx[j+lo]);
1520	1691	for (j1 = j + n; j1 < m; j1 += n)
1521	1692	accum = AddMod(accum, rep(xx[j1+lo]), p);
1522		for (i = 0; i < NumPrimes; i++) {
	1693	for (i = 0; i < nprimes; i++) {
1523	1694	long q = GetFFTPrime(i);
1524	1695	long t = accum;
1525		if (t >= q) t -= q;
	1696	t = sp_CorrectExcess(t, q);
1526	1697	y.tbl[i][j] = t;
1527	1698	}
1528	1699	}

1535	1706	FFTFwd(yp, yp, k, *p_info);
1536	1707	}
1537	1708	else {
1538		for (i = 0; i < info->NumPrimes; i++) {
	1709	for (i = 0; i < nprimes; i++) {
1539	1710	long *yp = &y.tbl[i][0];
1540	1711	FFTFwd(yp, yp, k, i);
1541	1712	}

1550	1721	// using "inverted" evaluation points.
1551	1722
1552	1723	{
1553		zz_pInfoT *info = zz_pInfo.get();
	1724	zz_pInfoT *info = zz_pInfo;
1554	1725	long p = info->p;
1555	1726
1556	1727	long n, i, j, m, j1;

1604	1775	for (i = 0; i < NumPrimes; i++) {
1605	1776	long q = GetFFTPrime(i);
1606	1777	long t = accum;
1607		if (t >= q) t -= q;
	1778	t = sp_CorrectExcess(t, q);
1608	1779	y.tbl[i][offset] = t;
1609	1780	}
1610	1781	}

1632	1803
1633	1804
1634	1805	{
1635		zz_pInfoT *info = zz_pInfo.get();
	1806	zz_pInfoT *info = zz_pInfo;
1636	1807
1637	1808	long k, n, i, j, l;
1638	1809	long NumPrimes = info->NumPrimes;
1639	1810
1640		long t[4];
1641	1811
1642	1812	k = y.k;
1643	1813	n = (1L << k);

1667	1837	xp[j].LoopHole() = yp[j+lo];
1668	1838	}
1669	1839	else {
1670		for (j = 0; j < l; j++) {
1671		for (i = 0; i < NumPrimes; i++)
1672		t[i] = y.tbl[i][j+lo];
1673
1674		FromModularRep(x.rep[j], t, info);
1675		}
	1840	FromModularRep(x.rep.elts(), y, lo, l, info);
1676	1841	}
1677	1842
1678	1843	x.normalize();

1686	1851
1687	1852
1688	1853	{
1689		zz_pInfoT *info = zz_pInfo.get();
	1854	zz_pInfoT *info = zz_pInfo;
1690	1855
1691	1856	long k, n, i, j, l;
1692	1857	long NumPrimes = info->NumPrimes;
1693	1858
1694		long t[4];
1695	1859
1696	1860	k = y.k;
1697	1861	n = (1L << k);

1721	1885	xp[j].LoopHole() = yp[j+lo];
1722	1886	}
1723	1887	else {
1724		for (j = 0; j < l; j++) {
1725		for (i = 0; i < NumPrimes; i++)
1726		t[i] = y.tbl[i][j+lo];
1727
1728		FromModularRep(x[j], t, info);
1729		}
	1888	FromModularRep(x.elts(), y, lo, l, info);
1730	1889	}
1731	1890	}
1732	1891
1733	1892	void NDFromfftRep(zz_pX& x, const fftRep& y, long lo, long hi, fftRep& z)
1734	1893	{
1735		zz_pInfoT *info = zz_pInfo.get();
	1894	zz_pInfoT *info = zz_pInfo;
1736	1895
1737	1896	long k, n, i, j, l;
1738	1897	long NumPrimes = info->NumPrimes;
1739	1898
1740		long t[4];
1741	1899
1742	1900	k = y.k;
1743	1901	n = (1L << k);

1771	1929	xp[j].LoopHole() = zp[j+lo];
1772	1930	}
1773	1931	else {
1774		for (j = 0; j < l; j++) {
1775		for (i = 0; i < NumPrimes; i++)
1776		t[i] = z.tbl[i][j+lo];
1777
1778		FromModularRep(x.rep[j], t, info);
1779		}
	1932	FromModularRep(x.rep.elts(), z, lo, l, info);
1780	1933	}
1781	1934
1782	1935	x.normalize();

1795	1948
1796	1949
1797	1950	{
1798		zz_pInfoT *info = zz_pInfo.get();
	1951	zz_pInfoT *info = zz_pInfo;
1799	1952
1800	1953	long k, n, i, j;
1801	1954	long NumPrimes = info->NumPrimes;
1802	1955
1803		long t[4];
1804	1956
1805	1957	k = y.k;
1806	1958	n = (1L << k);

1824	1976	long *yp = &y.tbl[i][0];
1825	1977	FFTRev1(yp, yp, k, i);
1826	1978	}
	1979
	1980	// take coefficients lo..min(hi, n-1) from y
	1981	// zero out coefficients max(n, lo)..hi
1827	1982
1828		for (j = lo; j <= hi; j++) {
1829		if (j >= n)
1830		clear(x[j-lo]);
1831		else {
1832		for (i = 0; i < info->NumPrimes; i++)
1833		t[i] = y.tbl[i][j];
1834
1835		FromModularRep(x[j-lo], t, info);
1836		}
1837		}
	1983	long l = min(hi, n-1) - lo + 1;
	1984	l = max(l, 0);
	1985	FromModularRep(x, y, lo, l, info);
	1986	for (j = max(n, lo); j <= hi; j++) clear(x[j-lo]);
1838	1987	}
1839	1988	}
1840	1989
1841	1990
1842	1991	void mul(fftRep& z, const fftRep& x, const fftRep& y)
1843	1992	{
1844		zz_pInfoT *info = zz_pInfo.get();
	1993	zz_pInfoT *info = zz_pInfo;
1845	1994
1846	1995	long k, n, i, j;
1847	1996

1886	2035
1887	2036	void sub(fftRep& z, const fftRep& x, const fftRep& y)
1888	2037	{
1889		zz_pInfoT *info = zz_pInfo.get();
	2038	zz_pInfoT *info = zz_pInfo;
1890	2039
1891	2040	long k, n, i, j;
1892	2041

1923	2072
1924	2073	void add(fftRep& z, const fftRep& x, const fftRep& y)
1925	2074	{
1926		zz_pInfoT *info = zz_pInfo.get();
	2075	zz_pInfoT *info = zz_pInfo;
1927	2076
1928	2077	long k, n, i, j;
1929	2078

1963	2112	// reduces a 2^l point FFT-rep to a 2^k point FFT-rep
1964	2113	// input may alias output
1965	2114	{
1966		zz_pInfoT *info = zz_pInfo.get();
	2115	zz_pInfoT *info = zz_pInfo;
1967	2116
1968	2117	long i, j, l, n;
1969	2118	long* xp;

1987	2136	void AddExpand(fftRep& x, const fftRep& a)
1988	2137	// x = x + (an "expanded" version of a)
1989	2138	{
1990		zz_pInfoT *info = zz_pInfo.get();
	2139	zz_pInfoT *info = zz_pInfo;
1991	2140
1992	2141	long i, j, l, k, n;
1993	2142

+521

-3

src/lzz_pX1.c less more

0
1	0
2	1	#include <NTL/lzz_pX.h>
3
4	2	#include <NTL/new.h>
	3
	4	#ifdef NTL_HAVE_AVX
	5	#include <immintrin.h>
	6	#endif
	7
5	8
6	9	NTL_START_IMPL
7	10

989	992
990	993
991	994
992		NTL_THREAD_LOCAL long zz_pXArgBound = 0;
	995	NTL_CHEAP_THREAD_LOCAL long zz_pXArgBound = 0;
993	996
994	997
995	998	void CompMod(zz_pX& x, const zz_pX& g, const zz_pX& h, const zz_pXModulus& F)

1065	1068	x2 = xx2;
1066	1069	x3 = xx3;
1067	1070	}
	1071
	1072
	1073	// BEGIN zz_pXAltArgument variation
	1074
	1075
	1076
	1077
	1078	void build(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F)
	1079	{
	1080	altH.orig = &H;
	1081
	1082
	1083	#ifdef NTL_HAVE_LL_TYPE
	1084	altH.mem.kill();
	1085	altH.row.kill();
	1086
	1087	#ifdef NTL_HAVE_AVX
	1088	altH.dmem.kill();
	1089	altH.drow.kill();
	1090	#endif
	1091
	1092	if (H.H.length() < 10 || F.n < 50) { altH.strategy = 0; return; }
	1093
	1094	altH.n = F.n;
	1095	altH.m = H.H.length()-1;
	1096
	1097	long p = zz_p::modulus();
	1098	long n = altH.n;
	1099	long m = altH.m;
	1100
	1101
	1102	#ifdef NTL_HAVE_AVX
	1103	if (n >= 128 && m <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1) &&
	1104	m*(p-1) <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1)) {
	1105	altH.strategy = 3;
	1106	altH.pinv_L = sp_PrepRem(p);
	1107	}
	1108	else
	1109	#endif
	1110	if (cast_unsigned(m) <= (~(0UL))/cast_unsigned(p-1) &&
	1111	cast_unsigned(m)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) {
	1112	altH.strategy = 1;
	1113	altH.pinv_L = sp_PrepRem(p);
	1114	}
	1115	else {
	1116	altH.strategy = 2;
	1117	altH.pinv_LL = make_sp_ll_reduce_struct(p);
	1118	}
	1119
	1120
	1121	if (altH.strategy == 1 || altH.strategy == 2) {
	1122
	1123	altH.row.SetLength(n);
	1124	long **row = altH.row.elts();
	1125
	1126	const long AllocAmt = 1L << 18;
	1127
	1128	long BlockSize = (AllocAmt + m - 1)/m;
	1129	long NumBlocks = (n + BlockSize - 1)/BlockSize;
	1130
	1131	altH.mem.SetLength(NumBlocks);
	1132
	1133	for (long i = 0; i < NumBlocks; i++) {
	1134	long first = i*BlockSize;
	1135	long last = min(n, first + BlockSize);
	1136	altH.mem[i].SetLength((last-first)*m);
	1137	for (long j = first; j < last; j++) {
	1138	row[j] = altH.mem[i].elts() + (j-first)*m;
	1139	}
	1140	}
	1141
	1142	for (long i = 0; i < m; i++) {
	1143	const zz_p* ptr = H.H[i].rep.elts();
	1144	long len = H.H[i].rep.length();
	1145	for (long j = 0; j < len; j++)
	1146	row[j][i] = rep(ptr[j]);
	1147	for (long j = len; j < n; j++)
	1148	row[j][i] = 0;
	1149	}
	1150	}
	1151	#ifdef NTL_HAVE_AVX
	1152	else {
	1153
	1154	// sanity check
	1155	if (m >= (1L << (NTL_BITS_PER_LONG-8))) ResourceError("zz_pXAltArgument: overflow");
	1156
	1157	long npanels = (n+15)/16;
	1158	long panel_size = 16*m;
	1159
	1160	const long AllocAmt = 1L << 18;
	1161
	1162	long BlockSize = (AllocAmt + panel_size - 1)/panel_size;
	1163	long NumBlocks = (npanels + BlockSize - 1)/BlockSize;
	1164
	1165	altH.dmem.SetLength(NumBlocks);
	1166	altH.drow.SetLength(npanels);
	1167	double **drow = altH.drow.elts();
	1168
	1169	for (long i = 0; i < NumBlocks; i++) {
	1170	long first = i*BlockSize;
	1171	long last = min(npanels, first + BlockSize);
	1172	altH.dmem[i].SetLength((last-first)*panel_size);
	1173
	1174	double *ptr = altH.dmem[i].get();
	1175
	1176	for (long j = first; j < last; j++)
	1177	drow[j] = ptr + (j-first)*panel_size;
	1178	}
	1179
	1180	for (long i = 0; i < m; i++) {
	1181	const zz_p *ptr = H.H[i].rep.elts();
	1182	long len = H.H[i].rep.length();
	1183	for (long j = 0; j < len; j++)
	1184	drow[j/16][(i*16) + (j%16)] = rep(ptr[j]);
	1185	for (long j = len; j < npanels*16; j++)
	1186	drow[j/16][(i*16) + (j%16)] = 0;
	1187	}
	1188	}
	1189
	1190	#endif
	1191
	1192
	1193	#endif
	1194	}
	1195
	1196
	1197	#ifdef NTL_HAVE_LL_TYPE
	1198
	1199
	1200	#ifdef NTL_HAVE_AVX
	1201	static
	1202	void mul16rowsD(double *x, const double *a, const double *b, long n)
	1203	{
	1204	__m256d avec0, avec1, avec2, avec3;
	1205
	1206	__m256d acc0 = _mm256_setzero_pd();
	1207	__m256d acc1 = _mm256_setzero_pd();
	1208	__m256d acc2 = _mm256_setzero_pd();
	1209	__m256d acc3 = _mm256_setzero_pd();
	1210
	1211	__m256d bvec;
	1212
	1213	for (long i = 0; i < n; i++) {
	1214	bvec = _mm256_broadcast_sd(&b[i]);
	1215
	1216	avec0 = _mm256_load_pd(a); a += 4;
	1217	avec1 = _mm256_load_pd(a); a += 4;
	1218	avec2 = _mm256_load_pd(a); a += 4;
	1219	avec3 = _mm256_load_pd(a); a += 4;
	1220
	1221	#ifdef NTL_HAVE_FMA
	1222
	1223	acc0 = _mm256_fmadd_pd(avec0, bvec, acc0);
	1224	acc1 = _mm256_fmadd_pd(avec1, bvec, acc1);
	1225	acc2 = _mm256_fmadd_pd(avec2, bvec, acc2);
	1226	acc3 = _mm256_fmadd_pd(avec3, bvec, acc3);
	1227
	1228	#else
	1229
	1230	acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0);
	1231	acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1);
	1232	acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2);
	1233	acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3);
	1234
	1235	#endif
	1236
	1237	}
	1238
	1239	_mm256_store_pd(x + 0*4, acc0);
	1240	_mm256_store_pd(x + 1*4, acc1);
	1241	_mm256_store_pd(x + 2*4, acc2);
	1242	_mm256_store_pd(x + 3*4, acc3);
	1243	}
	1244
	1245	static
	1246	void mul16rows2D(double *x, double *x_, const double *a, const double *b, const double *b_, long n)
	1247	{
	1248	__m256d avec0, avec1, avec2, avec3;
	1249
	1250	__m256d acc0 = _mm256_setzero_pd();
	1251	__m256d acc1 = _mm256_setzero_pd();
	1252	__m256d acc2 = _mm256_setzero_pd();
	1253	__m256d acc3 = _mm256_setzero_pd();
	1254
	1255	__m256d acc0_ = _mm256_setzero_pd();
	1256	__m256d acc1_ = _mm256_setzero_pd();
	1257	__m256d acc2_ = _mm256_setzero_pd();
	1258	__m256d acc3_ = _mm256_setzero_pd();
	1259
	1260
	1261	__m256d bvec;
	1262	__m256d bvec_;
	1263
	1264	for (long i = 0; i < n; i++) {
	1265	bvec = _mm256_broadcast_sd(&b[i]);
	1266	bvec_ = _mm256_broadcast_sd(&b_[i]);
	1267
	1268	avec0 = _mm256_load_pd(a); a += 4;
	1269	avec1 = _mm256_load_pd(a); a += 4;
	1270	avec2 = _mm256_load_pd(a); a += 4;
	1271	avec3 = _mm256_load_pd(a); a += 4;
	1272
	1273	#ifdef NTL_HAVE_FMA
	1274
	1275	acc0 = _mm256_fmadd_pd(avec0, bvec, acc0);
	1276	acc1 = _mm256_fmadd_pd(avec1, bvec, acc1);
	1277	acc2 = _mm256_fmadd_pd(avec2, bvec, acc2);
	1278	acc3 = _mm256_fmadd_pd(avec3, bvec, acc3);
	1279
	1280	acc0_ = _mm256_fmadd_pd(avec0, bvec_, acc0_);
	1281	acc1_ = _mm256_fmadd_pd(avec1, bvec_, acc1_);
	1282	acc2_ = _mm256_fmadd_pd(avec2, bvec_, acc2_);
	1283	acc3_ = _mm256_fmadd_pd(avec3, bvec_, acc3_);
	1284
	1285	#else
	1286	acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0);
	1287	acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1);
	1288	acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2);
	1289	acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3);
	1290
	1291	acc0_ = _mm256_add_pd(_mm256_mul_pd(avec0, bvec_), acc0_);
	1292	acc1_ = _mm256_add_pd(_mm256_mul_pd(avec1, bvec_), acc1_);
	1293	acc2_ = _mm256_add_pd(_mm256_mul_pd(avec2, bvec_), acc2_);
	1294	acc3_ = _mm256_add_pd(_mm256_mul_pd(avec3, bvec_), acc3_);
	1295
	1296	#endif
	1297
	1298	}
	1299
	1300	_mm256_store_pd(x + 0*4, acc0);
	1301	_mm256_store_pd(x + 1*4, acc1);
	1302	_mm256_store_pd(x + 2*4, acc2);
	1303	_mm256_store_pd(x + 3*4, acc3);
	1304
	1305	_mm256_store_pd(x_ + 0*4, acc0_);
	1306	_mm256_store_pd(x_ + 1*4, acc1_);
	1307	_mm256_store_pd(x_ + 2*4, acc2_);
	1308	_mm256_store_pd(x_ + 3*4, acc3_);
	1309	}
	1310
	1311
	1312	#endif
	1313
	1314
	1315
	1316	static
	1317	void InnerProduct_LL(zz_pX& x, const vec_zz_p& v, long low, long high,
	1318	const zz_pXAltArgument& H, long n)
	1319	{
	1320	high = min(high, v.length()-1);
	1321	long len = high-low+1;
	1322	if (len <= 0) {
	1323	clear(x);
	1324	return;
	1325	}
	1326
	1327	x.rep.SetLength(n);
	1328	zz_p *xp = x.rep.elts();
	1329
	1330	long p = zz_p::modulus();
	1331	sp_ll_reduce_struct pinv = H.pinv_LL;
	1332
	1333	const zz_p *vp = v.elts() + low;
	1334
	1335	for (long i = 0; i < n; i++)
	1336	xp[i].LoopHole() = InnerProd_LL(H.row[i], vp, len, p, pinv);
	1337
	1338	x.normalize();
	1339	}
	1340
	1341	static
	1342	void CompMod_LL(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
	1343	const zz_pXModulus& F)
	1344	{
	1345	if (deg(g) <= 0) {
	1346	x = g;
	1347	return;
	1348	}
	1349
	1350
	1351	zz_pX s, t;
	1352
	1353	long m = A.m;
	1354	long l = ((g.rep.length()+m-1)/m) - 1;
	1355
	1356	zz_pXMultiplier M;
	1357	build(M, A.orig->H[m], F);
	1358
	1359	InnerProduct_LL(t, g.rep, l*m, l*m + m - 1, A, F.n);
	1360	for (long i = l-1; i >= 0; i--) {
	1361	InnerProduct_LL(s, g.rep, i*m, i*m + m - 1, A, F.n);
	1362	MulMod(t, t, M, F);
	1363	add(t, t, s);
	1364	}
	1365
	1366	x = t;
	1367	}
	1368
	1369	static
	1370	void InnerProduct_L(zz_pX& x, const vec_zz_p& v, long low, long high,
	1371	const zz_pXAltArgument& H, long n)
	1372	{
	1373	high = min(high, v.length()-1);
	1374	long len = high-low+1;
	1375	if (len <= 0) {
	1376	clear(x);
	1377	return;
	1378	}
	1379
	1380	x.rep.SetLength(n);
	1381	zz_p *xp = x.rep.elts();
	1382
	1383	long p = zz_p::modulus();
	1384	sp_reduce_struct pinv = H.pinv_L;
	1385
	1386
	1387	const zz_p *vp = v.elts() + low;
	1388
	1389	for (long i = 0; i < n; i++)
	1390	xp[i].LoopHole() = InnerProd_L(H.row[i], vp, len, p, pinv);
	1391
	1392	x.normalize();
	1393	}
	1394
	1395	static
	1396	void CompMod_L(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
	1397	const zz_pXModulus& F)
	1398	{
	1399	if (deg(g) <= 0) {
	1400	x = g;
	1401	return;
	1402	}
	1403
	1404
	1405	zz_pX s, t;
	1406
	1407	long m = A.m;
	1408	long l = ((g.rep.length()+m-1)/m) - 1;
	1409
	1410	zz_pXMultiplier M;
	1411	build(M, A.orig->H[m], F);
	1412
	1413	InnerProduct_L(t, g.rep, l*m, l*m + m - 1, A, F.n);
	1414	for (long i = l-1; i >= 0; i--) {
	1415	InnerProduct_L(s, g.rep, i*m, i*m + m - 1, A, F.n);
	1416	MulMod(t, t, M, F);
	1417	add(t, t, s);
	1418	}
	1419
	1420	x = t;
	1421	}
	1422
	1423
	1424	#ifdef NTL_HAVE_AVX
	1425
	1426	static
	1427	void InnerProduct_AVX(zz_pX& x, const Vec<double>& v, long low, long high,
	1428	const zz_pXAltArgument& H, long n)
	1429	{
	1430	high = min(high, v.length()-1);
	1431	long len = high-low+1;
	1432	if (len <= 0) {
	1433	clear(x);
	1434	return;
	1435	}
	1436
	1437	x.rep.SetLength(n);
	1438	zz_p *xp = x.rep.elts();
	1439
	1440	long p = zz_p::modulus();
	1441	sp_reduce_struct pinv = H.pinv_L;
	1442
	1443
	1444	const double *vp = v.elts() + low;
	1445
	1446	NTL_AVX_LOCAL_ARRAY(res, double, 16);
	1447
	1448	long npanels = H.drow.length();
	1449
	1450	for (long i = 0, first = 0; i < npanels; i++, first += 16) {
	1451	mul16rowsD(res, H.drow[i], vp, len);
	1452	long last = min(n, first + 16);
	1453	for (long ii = first; ii < last; ii++)
	1454	xp[ii].LoopHole() = rem((unsigned long) (long) res[ii-first], p, pinv);
	1455	}
	1456
	1457	x.normalize();
	1458	}
	1459
	1460	static
	1461	void InnerProduct2_AVX(zz_pX& x, zz_pX& x_, const Vec<double>& v, long low, long low_, long len,
	1462	const zz_pXAltArgument& H, long n)
	1463	{
	1464	x.rep.SetLength(n);
	1465	zz_p *xp = x.rep.elts();
	1466
	1467	x_.rep.SetLength(n);
	1468	zz_p *xp_ = x_.rep.elts();
	1469
	1470	long p = zz_p::modulus();
	1471	sp_reduce_struct pinv = H.pinv_L;
	1472
	1473
	1474	const double *vp = v.elts() + low;
	1475	const double *vp_ = v.elts() + low_;
	1476
	1477	NTL_AVX_LOCAL_ARRAY(res, double, 16);
	1478	NTL_AVX_LOCAL_ARRAY(res_, double, 16);
	1479
	1480	long npanels = H.drow.length();
	1481
	1482	for (long i = 0, first = 0; i < npanels; i++, first += 16) {
	1483	mul16rows2D(res, res_, H.drow[i], vp, vp_, len);
	1484	long last = min(n, first + 16);
	1485	for (long ii = first; ii < last; ii++) {
	1486	xp[ii].LoopHole() = rem((unsigned long) (long) res[ii-first], p, pinv);
	1487	xp_[ii].LoopHole() = rem((unsigned long) (long) res_[ii-first], p, pinv);
	1488	}
	1489	}
	1490
	1491	x.normalize();
	1492	x_.normalize();
	1493	}
	1494
	1495	static
	1496	void CompMod_AVX(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
	1497	const zz_pXModulus& F)
	1498	{
	1499	if (deg(g) <= 0) {
	1500	x = g;
	1501	return;
	1502	}
	1503
	1504
	1505	zz_pX s, s_, t;
	1506
	1507	long m = A.m;
	1508	long l = ((g.rep.length()+m-1)/m) - 1;
	1509
	1510	zz_pXMultiplier M;
	1511	build(M, A.orig->H[m], F);
	1512
	1513	long len = g.rep.length();
	1514	Vec<double> gg;
	1515	gg.SetLength(len);
	1516	for (long i = 0; i < len; i++) gg[i] = rep(g.rep[i]);
	1517
	1518	InnerProduct_AVX(t, gg, l*m, l*m + m - 1, A, F.n);
	1519	long i = l-1;
	1520	for (; i >= 1; i -= 2) {
	1521	InnerProduct2_AVX(s, s_, gg, i*m, (i-1)*m, m, A, F.n);
	1522	MulMod(t, t, M, F);
	1523	add(t, t, s);
	1524	MulMod(t, t, M, F);
	1525	add(t, t, s_);
	1526	}
	1527
	1528	if (i >= 0) {
	1529	InnerProduct_AVX(s, gg, i*m, i*m + m - 1, A, F.n);
	1530	MulMod(t, t, M, F);
	1531	add(t, t, s);
	1532	}
	1533
	1534	x = t;
	1535	}
	1536	#endif
	1537
	1538
	1539
	1540	#endif
	1541
	1542
	1543
	1544	void CompMod(zz_pX& x, const zz_pX& g, const zz_pXAltArgument& A,
	1545	const zz_pXModulus& F)
	1546	{
	1547	if (!A.orig) LogicError("CompMod: uninitialized arg");
	1548
	1549	#ifndef NTL_HAVE_LL_TYPE
	1550	CompMod(x, g, *A.orig, F);
	1551	#else
	1552
	1553	switch (A.strategy) {
	1554	case 0:
	1555	CompMod(x, g, *A.orig, F);
	1556	break;
	1557
	1558	case 1:
	1559	CompMod_L(x, g, A, F);
	1560	break;
	1561
	1562	case 2:
	1563	CompMod_LL(x, g, A, F);
	1564	break;
	1565
	1566	#ifdef NTL_HAVE_AVX
	1567	case 3:
	1568	CompMod_AVX(x, g, A, F);
	1569	break;
	1570
	1571	#endif
	1572
	1573	default:
	1574	LogicError("CompMod: bad strategy");
	1575	}
	1576	#endif
	1577
	1578	}
	1579
	1580
	1581
	1582	// END zz_pXAltArgument variation
	1583
	1584
	1585
1068	1586
1069	1587	static void StripZeroes(vec_zz_p& x)
1070	1588	{

+60

-69

src/lzz_pXFactoring.c less more

0
1	0
2	1	#include <NTL/lzz_pXFactoring.h>
3		#include <NTL/vec_vec_lzz_p.h>
	2	#include <NTL/mat_lzz_p.h>
4	3	#include <NTL/FacVec.h>
5	4
6	5	#include <NTL/new.h>

143	142
144	143
145	144	static
146		void BuildMatrix(vec_vec_zz_p& M,
	145	void BuildMatrix(mat_zz_p& M,
147	146	long n, const zz_pX& g, const zz_pXModulus& F, long verbose)
148	147	{
149		long i, j, m;
150	148	zz_pXMultiplier G;
151	149	zz_pX h;
152	150
153		M.SetLength(n);
154		for (i = 0; i < n; i++)
155		M[i].SetLength(n);
	151	M.SetDims(n, n);
156	152
157	153	build(G, g, F);
158	154
159	155	set(h);
160		for (j = 0; j < n; j++) {
161		if (verbose && j % 10 == 0) cerr << "+";
162
163		m = deg(h);
164		for (i = 0; i < n; i++) {
165		if (i <= m)
166		M[i][j] = h.rep[i];
167		else
168		clear(M[i][j]);
169		}
170
171		if (j < n-1)
	156	for (long i = 0; i < n; i++) {
	157	if (verbose && i % 10 == 0) cerr << "+";
	158
	159	VectorCopy(M[i], h, n);
	160
	161	if (i < n-1)
172	162	MulMod(h, h, G, F);
173	163	}
174	164
175		for (i = 0; i < n; i++)
	165	for (long i = 0; i < n; i++)
176	166	add(M[i][i], M[i][i], -1);
177	167
178	168	}

225	215
226	216
227	217	static
228		void RandomBasisElt(zz_pX& g, const vec_long& D,
229		const vec_vec_zz_p& M)
230		{
231		zz_p t1, t2;
232
233		long n = D.length();
234
235		long i, j, s;
236
237		g.rep.SetLength(n);
238
239		vec_zz_p& v = g.rep;
240
241		for (j = n-1; j >= 0; j--) {
242		if (D[j] == -1)
243		random(v[j]);
244		else {
245		i = D[j];
246
247		// v[j] = sum_{s=j+1}^{n-1} v[s]*M[i,s]
248
249		clear(t1);
250
251		for (s = j+1; s < n; s++) {
252		mul(t2, v[s], M[i][s]);
253		add(t1, t1, t2);
254		}
255
256		v[j] = t1;
257		}
258		}
259
	218	void RandomBasisElt(zz_pX& g, mat_zz_p& ker)
	219	{
	220	long r = ker.NumRows();
	221	long n = ker.NumCols();
	222
	223	vec_zz_p v;
	224	v.SetLength(r);
	225	for (long i = 0; i < r; i++) random(v[i]);
	226
	227	mul(g.rep, v, ker);
260	228	g.normalize();
261	229	}
262	230

386	354	PowerXMod(g, p, F);
387	355	if (verbose) { cerr << (GetTime()-t) << "\n"; }
388	356
389		vec_long D;
390		long r;
391
392		vec_vec_zz_p M;
	357	mat_zz_p M, ker;
393	358
394	359	if (verbose) { cerr << "building matrix..."; t = GetTime(); }
395	360	BuildMatrix(M, n, g, F, verbose);
396	361	if (verbose) { cerr << (GetTime()-t) << "\n"; }
397	362
398	363	if (verbose) { cerr << "diagonalizing..."; t = GetTime(); }
399		NullSpace(r, D, M, verbose);
	364	kernel(ker, M);
400	365	if (verbose) { cerr << (GetTime()-t) << "\n"; }
401	366
	367
	368	M.kill();
	369
	370	long r = ker.NumRows();
402	371
403	372	if (verbose) cerr << "number of factors = " << r << "\n";
404	373

412	381
413	382	vec_zz_p roots;
414	383
415		RandomBasisElt(g, D, M);
	384	RandomBasisElt(g, ker);
416	385	MinPolyMod(h, g, F, r);
417		if (deg(h) == r) M.kill();
418	386	FindRoots(roots, h);
419	387	FindFactors(factors, f, g, roots);
420	388

424	392
425	393	while (factors.length() < r) {
426	394	if (verbose) cerr << "+";
427		RandomBasisElt(g, D, M);
	395	RandomBasisElt(g, ker);
428	396	S.kill();
429	397	for (i = 0; i < factors.length(); i++) {
430	398	const zz_pX& f = factors[i];

673	641	return !IsX(s);
674	642	}
675	643
676		NTL_THREAD_LOCAL long zz_pX_BlockingFactor = 10;
	644	NTL_CHEAP_THREAD_LOCAL long zz_pX_BlockingFactor = 10;
677	645
678	646	void DDF(vec_pair_zz_pX_long& factors, const zz_pX& ff, const zz_pX& hh,
679	647	long verbose)

1529	1497
1530	1498	/*********** NEW DDF **************/
1531	1499
1532		NTL_THREAD_LOCAL long zz_pX_GCDTableSize = 4;
1533		NTL_THREAD_LOCAL static vec_zz_pX *BabyStepFile = 0;
1534		NTL_THREAD_LOCAL static vec_zz_pX *GiantStepFile = 0;
1535		NTL_THREAD_LOCAL static zz_pXArgument *HHH = 0;
1536		NTL_THREAD_LOCAL static long OldN = 0;
	1500	NTL_CHEAP_THREAD_LOCAL long zz_pX_GCDTableSize = 4;
	1501	static NTL_CHEAP_THREAD_LOCAL vec_zz_pX *BabyStepFile = 0;
	1502	static NTL_CHEAP_THREAD_LOCAL vec_zz_pX *GiantStepFile = 0;
	1503	static NTL_CHEAP_THREAD_LOCAL zz_pXArgument *HHH = 0;
	1504	static NTL_CHEAP_THREAD_LOCAL zz_pXAltArgument *HHH1 = 0;
	1505	static NTL_CHEAP_THREAD_LOCAL long OldN = 0;
1537	1506
1538	1507
1539	1508	static

1567	1536	else {
1568	1537	zz_pXArgument H;
1569	1538	build(H, h, F, 2*rootn);
	1539
	1540	zz_pXAltArgument H1;
	1541	build(H1, H, F);
1570	1542
1571	1543
1572	1544	for (i = 1; i <= k-1; i++) {
1573	1545	(*BabyStepFile)(i) = h1;
1574	1546
1575		CompMod(h1, h1, H, F);
1576		if (verbose) cerr << "+";
	1547	CompMod(h1, h1, H1, F);
	1548	if (verbose) cerr << ".";
1577	1549	}
1578	1550	}
1579	1551

1591	1563	build(F, f);
1592	1564
1593	1565	build(*HHH, h, F, 2*SqrRoot(F.n));
	1566	build(*HHH1, *HHH, F);
1594	1567
1595	1568	OldN = F.n;
1596	1569

1677	1650	rem(last, last, F);
1678	1651	for (long i = 0; i < (*HHH).H.length(); i++)
1679	1652	rem((*HHH).H[i], (*HHH).H[i], F);
	1653	build(*HHH1, *HHH, F);
1680	1654	OldN = F.n;
1681	1655	}
1682	1656
1683	1657	(*GiantStepFile).SetLength(l+1);
1684		CompMod((*GiantStepFile)(l+1), last, *HHH, F);
	1658	CompMod((*GiantStepFile)(l+1), last, *HHH1, F);
1685	1659	g = (*GiantStepFile)(l+1);
1686	1660	}
1687	1661	else if (deg((*GiantStepFile)(gs)) >= F.n)

1910	1884	}
1911	1885
1912	1886	long B = deg(f)/2;
	1887
1913	1888	long k = SqrRoot(B);
	1889
	1890	// we double the number of baby steps if it seems like
	1891	// baby steps are significantly cheaper than giant steps.
	1892	// The calculations below are closely tied to a test in GenerateBabySteps:
	1893	// if nbm >= sdf/2, then scale should be 1 (baby steps and giant steps balanced)
	1894	if (B >= 500) {
	1895	long sdf = SqrRoot(deg(f));
	1896	long nbm = NumBits(zz_p::modulus());
	1897	double scale = 0.25*double(sdf)/double(nbm);
	1898	if (scale < 1) scale = 1;
	1899	if (scale > 2) scale = 2;
	1900	k = long(scale*k);
	1901	}
	1902
1914	1903	long l = (B+k-1)/k;
1915	1904
1916	1905	vec_zz_pX local_BabyStepFile;
1917	1906	vec_zz_pX local_GiantStepFile;
1918	1907	zz_pXArgument local_HHH;
	1908	zz_pXAltArgument local_HHH1;
1919	1909
1920	1910	BabyStepFile = &local_BabyStepFile;
1921	1911	GiantStepFile = &local_GiantStepFile;
1922	1912	HHH = &local_HHH;
	1913	HHH1 = &local_HHH1;
1923	1914
1924	1915	zz_pX h1;
1925	1916	GenerateBabySteps(h1, f, h, k, verbose);

-2

src/mach_desc.win less more

15	15	#define NTL_WIDE_DOUBLE_DP ((wide_double(1L<<52)))
16	16	#define NTL_QUAD_FLOAT_SPLIT ((((double)(1L<<27)))+1.0)
17	17	#define NTL_EXT_DOUBLE (0)
18
19
	18	#define NTL_FMA_DETECTED (1)
20	19
21	20
22	21

+46

-32

src/makefile less more

12	12
13	13	CXXFLAGS=-g -O2
14	14	# Flags for the C++ compiler
	15
	16	CXXAUTOFLAGS=
	17	# Flags for the C++ compiler, automatically generated by configuration script
15	18
16	19
17	20	AR=ar

69	72
70	73	GMP_OPT_INCDIR=# -I$(GMP_INCDIR) # GMPI
71	74	GMP_OPT_LIBDIR=# -L$(GMP_LIBDIR) # GMPL
72		GMP_OPT_LIB=# -lgmp # GMP
	75	GMP_OPT_LIB=-lgmp # GMP
73	76	# uncomment these if using GMP
74	77
75	78

136	139	O16=$(O15)
137	140	O17=$(O16)
138	141	O18=$(O17) xdouble.o
139		O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o
	142	O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o BasicThreadPool.o
140	143
141	144	OBJ=$(O19)
142	145

161	164	S16=$(S15)
162	165	S17=$(S16)
163	166	S18=$(S17) xdouble.c
164		S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c
	167	S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c BasicThreadPool.c
165	168
166	169	SRC = $(S19)
167	170

193	196	IN16=$(IN15) vec_vec_ZZ_p.h vec_vec_ZZ_pE.h vec_vec_long.h vec_vec_lzz_p.h
194	197	IN17=$(IN16) vec_vec_lzz_pE.h vec_xdouble.h xdouble.h config.h version.h
195	198	IN18=$(IN17) def_config.h new.h vec_ulong.h vec_vec_ulong.h c_lip.h g_lip.h
196		IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h
197		IN20=$(IN19) have_LL_no.h have_LL_yes.h have_builtin_clzl_no.h have_builtin_clzl_yes.h
198
199		INCL=$(IN20)
	199	IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h BasicThreadPool.h
	200	INCL=$(IN19)
200	201
201	202
202	203

212	213	# test source files
213	214
214	215	TS1=QuickTest.c BerlekampTest.c CanZassTest.c ZZXFacTest.c MoreFacTest.c LLLTest.c
215		TS2=$(TS1) subset.c MatrixTest.c CharPolyTest.c RRTest.c QuadTest.c
	216	TS2=$(TS1) subset.c MatrixTest.c mat_lzz_pTest.c CharPolyTest.c RRTest.c QuadTest.c
216	217	TS3=$(TS2) GF2XTest.c GF2EXTest.c BitMatTest.c ZZ_pEXTest.c lzz_pEXTest.c Timing.c
217	218	TS4=$(TS3) ThreadTest.c ExceptionTest.c
218	219	TS = $(TS4)
219	220
220	221	# scripts
221	222
222		SCRIPTS1=MakeGetTime MakeGetPID MakeCheckCLZL MakeCheckLL TestScript dosify unixify RemoveProg
	223	SCRIPTS1=MakeGetTime MakeGetPID MakeCheckFeature ResetFeatures CopyFeatures TestScript dosify unixify RemoveProg
223	224	SCRIPTS2=$(SCRIPTS1) configure DoConfig mfile cfile ppscript
224	225
225	226	SCRIPTS=$(SCRIPTS2)
226	227
227	228	# auxilliary source
228	229
229		MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c CheckPCLMUL.c
230		GT=GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
	230	MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c
	231	GT=GetTime0.c GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
231	232	GP=GetPID1.c GetPID2.c TestGetPID.c
232		CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c
	233	CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c CheckAVX.c CheckFMA.c CheckCompile.c
	234
	235	AUXPROGS = TestGetTime TestGetPID CheckFeature CheckCompile
233	236
234	237
235	238
236	239	# documentation
237	240
238	241
239		D01=copying.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
	242	D01=copying.txt BasicThreadPool.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
240	243	D02=$(D01) GF2XFactoring.txt GF2XVec.txt HNF.txt Lazy.txt LazyTable.txt LLL.txt RR.txt SmartPtr.txt
241	244	D03=$(D02) ZZ.txt ZZVec.txt ZZX.txt ZZXFactoring.txt ZZ_p.txt ZZ_pE.txt
242	245	D04=$(D03) ZZ_pEX.txt ZZ_pEXFactoring.txt ZZ_pX.txt ZZ_pXFactoring.txt

252	255	D14=$(D13) tour-modules.html tour-unix.html tour-examples.html
253	256	D15=$(D14) tour-roadmap.html tour-win.html tour-impl.html tour-struct.html
254	257	D16=$(D15) tour.html tour-ex1.html tour-ex2.html tour-ex3.html tour-ex4.html
255		D17=$(D16) tour-ex5.html tour-ex6.html arrow1.gif arrow2.gif arrow3.gif
	258	D17=$(D16) tour-ex5.html tour-ex6.html tour-ex7.html arrow1.gif arrow2.gif arrow3.gif
256	259	D18=$(D17) tour-gmp.html tour-gf2x.html tour-tips.html config.txt version.txt
257	260
258	261	TX01=GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt GF2XFactoring.txt

263	266	TX06=mat_ZZ_pE.txt mat_lzz_p.txt mat_lzz_pE.txt mat_poly_ZZ.txt mat_poly_ZZ_p.txt
264	267	TX07=mat_poly_lzz_p.txt matrix.txt pair.txt quad_float.txt tools.txt vec_GF2.txt
265	268	TX08=vec_GF2E.txt vec_RR.txt vec_ZZ.txt vec_ZZ_p.txt vec_ZZ_pE.txt vec_lzz_p.txt
266		TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt
	269	TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt BasicThreadPool.txt
267	270
268	271	TXFILES=$(TX01) $(TX02) $(TX03) $(TX04) $(TX05) $(TX06) $(TX07) $(TX08) $(TX09)
269	272

275	278	HT06=mat_ZZ_pE.cpp.html mat_lzz_p.cpp.html mat_lzz_pE.cpp.html mat_poly_ZZ.cpp.html mat_poly_ZZ_p.cpp.html
276	279	HT07=mat_poly_lzz_p.cpp.html matrix.cpp.html pair.cpp.html quad_float.cpp.html tools.cpp.html vec_GF2.cpp.html
277	280	HT08=vec_GF2E.cpp.html vec_RR.cpp.html vec_ZZ.cpp.html vec_ZZ_p.cpp.html vec_ZZ_pE.cpp.html vec_lzz_p.cpp.html
278		HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html
	281	HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html BasicThreadPool.cpp.html
279	282
280	283	HTFILES=$(HT01) $(HT02) $(HT03) $(HT04) $(HT05) $(HT06) $(HT07) $(HT08) $(HT09)
281	284

287	290	# test program executables
288	291
289	292	PROG1=QuickTest BerlekampTest CanZassTest ZZXFacTest MoreFacTest LLLTest BitMatTest
290		PROG2=$(PROG1) MatrixTest CharPolyTest RRTest QuadTest
	293	PROG2=$(PROG1) MatrixTest mat_lzz_pTest CharPolyTest RRTest QuadTest
291	294	PROG3=$(PROG2) GF2XTest GF2EXTest subset ZZ_pEXTest lzz_pEXTest Timing ThreadTest
292	295	PROGS = $(PROG3)
293	296
294	297	# things to save to a tar file
295	298
296	299	SFI1=makefile $(SRC) $(SINC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) mach_desc.win
297		SFI2=$(SFI1) MulTimeTest.c PolyTimeTest.c Poly1TimeTest.c GF2XTimeTest.c
	300	SFI2=$(SFI1) MulTimeTest.c Poly1TimeTest.c Poly2TimeTest.c Poly3TimeTest.c GF2XTimeTest.c
298	301	SFI3=$(SFI2) InitSettings.c DispSettings.c WizardAux Wizard def_makefile
299	302	SFILES=$(SFI3)
300	303

309	312	NTL_INCLUDE = -I../include -I.
310	313	# NTL needs this to find its include files
311	314
312		COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) -c
313
314		LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS)
	315	COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) -c
	316
	317	LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) $(LDFLAGS)
315	318
316	319
317	320

341	344	# setup2 does some dynamic checks for GetTime, GetPID, __builtin_clzl, and LL types
342	345
343	346	setup2:
	347	echo "* CheckFeature log *" > CheckFeature.log
344	348	sh MakeGetTime "$(LINK)" "$(LDLIBS)"
345	349	sh MakeGetPID "$(LINK)" "$(LDLIBS)"
346		sh MakeCheckCLZL "$(LINK)" "$(LDLIBS)"
347		sh MakeCheckLL "$(LINK)" "$(LDLIBS)"
	350	sh MakeCheckFeature BUILTIN_CLZL "CheckCLZL.c CheckCLZLAux.c" "$(LINK)" "$(LDLIBS)"
	351	sh MakeCheckFeature LL_TYPE "CheckLL.c CheckLLAux.c" "$(LINK)" "$(LDLIBS)"
	352	sh MakeCheckFeature AVX "CheckAVX.c" "$(LINK)" "$(LDLIBS)"
	353	sh MakeCheckFeature FMA "CheckFMA.c" "$(LINK)" "$(LDLIBS)"
348	354
349	355	# setup3 generates the file ../include/NTL/gmp_aux.h
350	356	# The file ../include/NTL/gmp_aux.h is included in ../include/NTL/lip.h

382	388	GetPID.o: GetPID.c
383	389	$(LCOMP) $(COMPILE) GetPID.c
384	390
385		CheckPCLMUL: CheckPCLMUL.c
386		$(LINK) -o CheckPCLMUL CheckPCLMUL.c $(LDLIBS)
	391	CheckCompile: CheckCompile.c
	392	$(LINK) -o CheckCompile CheckCompile.c $(LDLIBS)
	393
387	394
388	395	.c.o:
389	396	$(LCOMP) $(COMPILE) $(GF2X_OPT_INCDIR) $<

460	467
461	468	clobber:
462	469	rm -f ntl.a mach_desc.h ../include/NTL/mach_desc.h GetTime.c GetPID.c
463		cp ../include/NTL/have_LL_no.h ../include/NTL/have_LL.h
464		cp ../include/NTL/have_builtin_clzl_no.h ../include/NTL/have_builtin_clzl.h
	470	sh ResetFeatures '..'
465	471	rm -f ../include/NTL/gmp_aux.h
466		sh RemoveProg $(PROGS) MakeDesc TestGetTime TestGetPID gen_gmp_aux
	472	sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
467	473	rm -f *.o
468	474	rm -rf small
469	475	rm -f cfileout mfileout

471	477	rm -f all
472	478
473	479	clean:
474		sh RemoveProg MakeDesc TestGetTime TestGetPID gen_gmp_aux
	480	sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
475	481	rm -f *.o
476	482	rm -rf small
477	483	# - $(LIBTOOL) --mode=clean rm -f libntl.la *.lo #LSHAR

497	503
498	504
499	505	package:
	506	./configure --nowrite
	507	cp mfileout def_makefile
	508	cp cfileout ../include/NTL/def_config.h
500	509	sh unixify "$(SFILES) DIRNAME WINDIR VERSION_INFO NOTES" "$(INCL)" "$(DOC)"
501	510	rm -rf `cat DIRNAME`
502	511	rm -f `cat DIRNAME`.tar

508	517	rm -rf `cat DIRNAME`
509	518
510	519	winpack:
	520	./configure --nowrite NTL_GMP_LIP=off
	521	cp mfileout def_makefile
	522	cp cfileout ../include/NTL/def_config.h
511	523	sh dosify "$(SRC)" "$(INCL)" "$(DOC)" "$(TS)" "$(TD)" "$(SINC)"
512	524	rm -rf `cat WINDIR`
513	525	rm -f `cat WINDIR`.zip

526	538
527	539	WO1 = FFT.o GetTime.o GetPID.o ctools.o ZZ.o ZZVec.o ZZ_p.o ZZ_pX.o
528	540	WO2 = $(WO1) ZZ_pX1.o lip.o tools.o vec_ZZ.o vec_ZZ_p.o
529		WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o fileio.o
	541	WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o BasicThreadPool.o fileio.o
530	542
531	543	WOBJ = $(WO3)
532	544

538	550	MulTimeTest:
539	551	$(LINK) -o MulTimeTest MulTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
540	552
541		PolyTimeTest:
542		$(LINK) -o PolyTimeTest PolyTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
543	553
544	554	Poly1TimeTest:
545	555	$(LINK) -o Poly1TimeTest Poly1TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
	556	Poly2TimeTest:
	557	$(LINK) -o Poly2TimeTest Poly2TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
	558	Poly3TimeTest:
	559	$(LINK) -o Poly3TimeTest Poly3TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
546	560
547	561
548	562	GF2XTimeTest:

+20

-3

src/mat_GF2.c less more

314	314
315	315
316	316
317		void solve(ref_GF2 d, vec_GF2& X, const mat_GF2& A, const vec_GF2& b)
	317	static
	318	void solve_impl(ref_GF2 d, vec_GF2& X, const mat_GF2& A, const vec_GF2& b, bool trans)
318	319
319	320	{
320	321	long n = A.NumRows();

335	336	mat_GF2 M;
336	337	M.SetDims(n, n+1);
337	338
338		for (i = 0; i < n; i++) {
339		AddToCol(M, i, A[i]);
	339	if (trans) {
	340	for (i = 0; i < n; i++) {
	341	AddToCol(M, i, A[i]);
	342	}
	343	}
	344	else {
	345	for (i = 0; i < n; i++) {
	346	VectorCopy(M[i], A[i], n+1);
	347	}
340	348	}
341	349
342	350	AddToCol(M, n, b);

397	405	return;
398	406	}
399	407
	408	void solve(ref_GF2 d, vec_GF2& x, const mat_GF2& A, const vec_GF2& b)
	409	{
	410	solve_impl(d, x, A, b, true);
	411	}
	412
	413	void solve(ref_GF2 d, const mat_GF2& A, vec_GF2& x, const vec_GF2& b)
	414	{
	415	solve_impl(d, x, A, b, false);
	416	}
400	417
401	418
402	419	void inv(ref_GF2 d, mat_GF2& X, const mat_GF2& A)

+18

-4

src/mat_GF2E.c less more

281	281	}
282	282
283	283
284		void solve(GF2E& d, vec_GF2E& X,
285		const mat_GF2E& A, const vec_GF2E& b)
	284	static
	285	void solve_impl(GF2E& d, vec_GF2E& X, const mat_GF2E& A, const vec_GF2E& b, bool trans)
286	286
287	287	{
288	288	long n = A.NumRows();

310	310
311	311	for (i = 0; i < n; i++) {
312	312	M[i].SetSize(n+1, 2*GF2E::WordLength());
313		for (j = 0; j < n; j++)
314		M[i][j] = rep(A[j][i]);
	313
	314	if (trans)
	315	for (j = 0; j < n; j++) M[i][j] = rep(A[j][i]);
	316	else
	317	for (j = 0; j < n; j++) M[i][j] = rep(A[i][j]);
	318
315	319	M[i][n] = rep(b[i]);
316	320	}
317	321

377	381	}
378	382
379	383	conv(d, det);
	384	}
	385
	386	void solve(GF2E& d, vec_GF2E& x, const mat_GF2E& A, const vec_GF2E& b)
	387	{
	388	solve_impl(d, x, A, b, true);
	389	}
	390
	391	void solve(GF2E& d, const mat_GF2E& A, vec_GF2E& x, const vec_GF2E& b)
	392	{
	393	solve_impl(d, x, A, b, false);
380	394	}
381	395
382	396	void inv(GF2E& d, mat_GF2E& X, const mat_GF2E& A)

+18

-4

src/mat_ZZ_p.c less more

313	313	}
314	314
315	315
316		void solve(ZZ_p& d, vec_ZZ_p& X,
317		const mat_ZZ_p& A, const vec_ZZ_p& b)
	316	static
	317	void solve_impl(ZZ_p& d, vec_ZZ_p& X, const mat_ZZ_p& A, const vec_ZZ_p& b, bool trans)
318	318
319	319	{
320	320	long n = A.NumRows();

344	344
345	345	for (i = 0; i < n; i++) {
346	346	M[i].SetSize(n+1, t1.size());
347		for (j = 0; j < n; j++)
348		M[i][j] = rep(A[j][i]);
	347
	348	if (trans)
	349	for (j = 0; j < n; j++) M[i][j] = rep(A[j][i]);
	350	else
	351	for (j = 0; j < n; j++) M[i][j] = rep(A[i][j]);
	352
349	353	M[i][n] = rep(b[i]);
350	354	}
351	355

413	417	}
414	418
415	419	conv(d, det);
	420	}
	421
	422	void solve(ZZ_p& d, vec_ZZ_p& x, const mat_ZZ_p& A, const vec_ZZ_p& b)
	423	{
	424	solve_impl(d, x, A, b, true);
	425	}
	426
	427	void solve(ZZ_p& d, const mat_ZZ_p& A, vec_ZZ_p& x, const vec_ZZ_p& b)
	428	{
	429	solve_impl(d, x, A, b, false);
416	430	}
417	431
418	432	void inv(ZZ_p& d, mat_ZZ_p& X, const mat_ZZ_p& A)

+23

-5

src/mat_ZZ_pE.c less more

315	315	}
316	316
317	317
318		void solve(ZZ_pE& d, vec_ZZ_pE& X,
319		const mat_ZZ_pE& A, const vec_ZZ_pE& b)
	318	static
	319	void solve_impl(ZZ_pE& d, vec_ZZ_pE& X, const mat_ZZ_pE& A, const vec_ZZ_pE& b, bool trans)
320	320
321	321	{
322	322	long n = A.NumRows();

344	344
345	345	for (i = 0; i < n; i++) {
346	346	M[i].SetLength(n+1);
347		for (j = 0; j < n; j++) {
348		M[i][j].rep.SetMaxLength(2*deg(p)-1);
349		M[i][j] = rep(A[j][i]);
	347	if (trans) {
	348	for (j = 0; j < n; j++) {
	349	M[i][j].rep.SetMaxLength(2*deg(p)-1);
	350	M[i][j] = rep(A[j][i]);
	351	}
	352	}
	353	else {
	354	for (j = 0; j < n; j++) {
	355	M[i][j].rep.SetMaxLength(2*deg(p)-1);
	356	M[i][j] = rep(A[i][j]);
	357	}
350	358	}
351	359	M[i][n].rep.SetMaxLength(2*deg(p)-1);
352	360	M[i][n] = rep(b[i]);

416	424	}
417	425
418	426	conv(d, det);
	427	}
	428
	429	void solve(ZZ_pE& d, vec_ZZ_pE& x, const mat_ZZ_pE& A, const vec_ZZ_pE& b)
	430	{
	431	solve_impl(d, x, A, b, true);
	432	}
	433
	434	void solve(ZZ_pE& d, const mat_ZZ_pE& A, vec_ZZ_pE& x, const vec_ZZ_pE& b)
	435	{
	436	solve_impl(d, x, A, b, false);
419	437	}
420	438
421	439	void inv(ZZ_pE& d, mat_ZZ_pE& X, const mat_ZZ_pE& A)

+7057

-815

src/mat_lzz_p.c less more

3	3	#include <NTL/vec_long.h>
4	4
5	5
	6	#include <NTL/BasicThreadPool.h>
	7
	8
	9
	10	#ifdef NTL_HAVE_AVX
	11	#include <immintrin.h>
	12	#endif
6	13
7	14	NTL_START_IMPL
	15
	16
	17	#define PAR_THRESH_SQ (200)
	18	#define PAR_THRESH (40000)
	19
	20
	21	// *******************************************************
	22	//
	23	// Matrix Window data structure: perhaps some day this
	24	// will be made public.
	25	//
	26	// *******************************************************
	27
	28	struct mat_window_zz_p {
	29	mat_zz_p &A;
	30	long r_offset;
	31	long c_offset;
	32	long nrows;
	33	long ncols;
	34
	35	mat_window_zz_p(mat_zz_p& _A) :
	36	A(_A), r_offset(0), c_offset(0), nrows(A.NumRows()), ncols(A.NumCols()) { }
	37
	38	mat_window_zz_p(const mat_window_zz_p& w, long r1, long c1, long r2, long c2) :
	39	A(w.A)
	40	{
	41	if (r1 < 0 || c1 < 0 || r2 < r1 || c2 < c1 || r2-r1 > w.nrows || c2-c1 > w.ncols)
	42	LogicError("mat_window_zz_p: bad args");
	43
	44	r_offset = w.r_offset + r1;
	45	c_offset = w.c_offset + c1;
	46	nrows = r2-r1;
	47	ncols = c2-c1;
	48	}
	49
	50	zz_p * operator[](long i) const { return A[i+r_offset].elts() + c_offset; }
	51
	52	long NumRows() const { return nrows; }
	53	long NumCols() const { return ncols; }
	54
	55	};
	56
	57
	58	struct const_mat_window_zz_p {
	59	const mat_zz_p &A;
	60	long r_offset;
	61	long c_offset;
	62	long nrows;
	63	long ncols;
	64
	65	const_mat_window_zz_p(const mat_zz_p& _A) :
	66	A(_A), r_offset(0), c_offset(0), nrows(A.NumRows()), ncols(A.NumCols()) { }
	67
	68	const_mat_window_zz_p(const mat_window_zz_p& w) :
	69	A(w.A), r_offset(w.r_offset), c_offset(w.c_offset), nrows(w.nrows), ncols(w.ncols) { }
	70
	71	const_mat_window_zz_p(const const_mat_window_zz_p& w, long r1, long c1, long r2, long c2) :
	72	A(w.A)
	73	{
	74	if (r1 < 0 || c1 < 0 || r2 < r1 || c2 < c1 || r2-r1 > w.nrows || c2-c1 > w.ncols)
	75	LogicError("const_mat_window_zz_p: bad args");
	76
	77	r_offset = w.r_offset + r1;
	78	c_offset = w.c_offset + c1;
	79	nrows = r2-r1;
	80	ncols = c2-c1;
	81	}
	82
	83	const zz_p * operator[](long i) const { return A[i+r_offset].elts() + c_offset; }
	84
	85	long NumRows() const { return nrows; }
	86	long NumCols() const { return ncols; }
	87
	88	};
	89
	90	void add(const mat_window_zz_p& X,
	91	const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
	92	{
	93	long n = A.NumRows();
	94	long m = A.NumCols();
	95
	96	if (B.NumRows() != n || B.NumCols() != m)
	97	LogicError("matrix add: dimension mismatch");
	98
	99	if (X.NumRows() != n || X.NumCols() != m)
	100	LogicError("matrix add: dimension mismatch");
	101
	102	long p = zz_p::modulus();
	103
	104	for (long i = 0; i < n; i++) {
	105	zz_p *x = X[i];
	106	const zz_p *a = A[i];
	107	const zz_p *b = B[i];
	108	for (long j = 0; j < m; j++) {
	109	x[j].LoopHole() = AddMod(rep(a[j]), rep(b[j]), p);
	110	}
	111	}
	112	}
	113
	114	void sub(const mat_window_zz_p& X,
	115	const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
	116	{
	117	long n = A.NumRows();
	118	long m = A.NumCols();
	119
	120	if (B.NumRows() != n || B.NumCols() != m)
	121	LogicError("matrix sub: dimension mismatch");
	122
	123	if (X.NumRows() != n || X.NumCols() != m)
	124	LogicError("matrix sub: dimension mismatch");
	125
	126	long p = zz_p::modulus();
	127
	128	for (long i = 0; i < n; i++) {
	129	zz_p *x = X[i];
	130	const zz_p *a = A[i];
	131	const zz_p *b = B[i];
	132	for (long j = 0; j < m; j++) {
	133	x[j].LoopHole() = SubMod(rep(a[j]), rep(b[j]), p);
	134	}
	135	}
	136	}
	137
	138
	139	void clear(const mat_window_zz_p& X)
	140	{
	141	long n = X.NumRows();
	142	long m = X.NumCols();
	143
	144	for (long i = 0; i < n; i++)
	145	for (long j = 0; j < m; j++)
	146	clear(X[i][j]);
	147	}
	148
	149
	150
	151	// ***********************************************************
	152
	153
	154
	155
8	156
9	157
10	158	void add(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)

16	164	LogicError("matrix add: dimension mismatch");
17	165
18	166	X.SetDims(n, m);
	167
	168	long p = zz_p::modulus();
19	169
20		long i, j;
21		for (i = 1; i <= n; i++)
22		for (j = 1; j <= m; j++)
23		add(X(i,j), A(i,j), B(i,j));
	170	for (long i = 0; i < n; i++) {
	171	zz_p *x = X[i].elts();
	172	const zz_p *a = A[i].elts();
	173	const zz_p *b = B[i].elts();
	174	for (long j = 0; j < m; j++) {
	175	x[j].LoopHole() = AddMod(rep(a[j]), rep(b[j]), p);
	176	}
	177	}
24	178	}
25	179
26	180	void sub(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)

32	186	LogicError("matrix sub: dimension mismatch");
33	187
34	188	X.SetDims(n, m);
	189
	190	long p = zz_p::modulus();
35	191
	192	for (long i = 0; i < n; i++) {
	193	zz_p *x = X[i].elts();
	194	const zz_p *a = A[i].elts();
	195	const zz_p *b = B[i].elts();
	196	for (long j = 0; j < m; j++) {
	197	x[j].LoopHole() = SubMod(rep(a[j]), rep(b[j]), p);
	198	}
	199	}
	200
	201	}
	202
	203
	204
	205
	206
	207	void diag(mat_zz_p& X, long n, zz_p d)
	208	{
	209	X.SetDims(n, n);
36	210	long i, j;
	211
37	212	for (i = 1; i <= n; i++)
38		for (j = 1; j <= m; j++)
39		sub(X(i,j), A(i,j), B(i,j));
40		}
41
42
43		// some local buffers
44
45		NTL_THREAD_LOCAL static vec_long mul_aux_vec;
46		NTL_THREAD_LOCAL static Vec<mulmod_precon_t> precon_vec;
47
48
49
50		static
51		void mul_aux(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
52		{
53		long n = A.NumRows();
54		long l = A.NumCols();
55		long m = B.NumCols();
56
57		if (l != B.NumRows())
58		LogicError("matrix mul: dimension mismatch");
59
60		X.SetDims(n, m);
61
62		if (m > 1) { // new preconditioning code
63
64		long p = zz_p::modulus();
65		mulmod_t pinv = zz_p::ModulusInverse();
66
67
68		vec_long::Watcher watch_mul_aux_vec(mul_aux_vec);
69		mul_aux_vec.SetLength(m);
70		long *acc = mul_aux_vec.elts();
71
72		long i, j, k;
73
74		for (i = 0; i < n; i++) {
75		const zz_p* ap = A[i].elts();
76
77		for (j = 0; j < m; j++) acc[j] = 0;
78
79		for (k = 0; k < l; k++) {
80		long aa = rep(ap[k]);
81		if (aa != 0) {
82		const zz_p* bp = B[k].elts();
83		long T1;
84		mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);
85
86		for (j = 0; j < m; j++) {
87		T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
88		acc[j] = AddMod(acc[j], T1, p);
89		}
90		}
91		}
92
93		zz_p *xp = X[i].elts();
94		for (j = 0; j < m; j++)
95		xp[j].LoopHole() = acc[j];
96		}
97		}
98		else { // just use the old code, w/o preconditioning
99
100		long p = zz_p::modulus();
101		mulmod_t pinv = zz_p::ModulusInverse();
102
103		long i, j, k;
104		long acc, tmp;
105
106		for (i = 1; i <= n; i++) {
107		for (j = 1; j <= m; j++) {
108		acc = 0;
109		for(k = 1; k <= l; k++) {
110		tmp = MulMod(rep(A(i,k)), rep(B(k,j)), p, pinv);
111		acc = AddMod(acc, tmp, p);
112		}
113		X(i,j).LoopHole() = acc;
114		}
115		}
116
117		}
118		}
119
120		void mul(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
121		{
122		if (&X == &A || &X == &B) {
123		mat_zz_p tmp;
124		mul_aux(tmp, A, B);
125		X = tmp;
126		}
127		else
128		mul_aux(X, A, B);
129		}
130
131
132		void mul(vec_zz_p& x, const vec_zz_p& a, const mat_zz_p& B)
133		{
134		long l = a.length();
135		long m = B.NumCols();
136
137		if (l != B.NumRows())
138		LogicError("matrix mul: dimension mismatch");
139
140		if (m == 0) {
141
142		x.SetLength(0);
143
144		}
145		else if (m == 1) {
146
147		long p = zz_p::modulus();
148		mulmod_t pinv = zz_p::ModulusInverse();
149
150		long acc, tmp;
151		long k;
152
153		acc = 0;
154		for(k = 1; k <= l; k++) {
155		tmp = MulMod(rep(a(k)), rep(B(k,1)), p, pinv);
156		acc = AddMod(acc, tmp, p);
157		}
158
159		x.SetLength(1);
160		x(1).LoopHole() = acc;
161
162		}
163		else { // m > 1. precondition
164
165
166		long p = zz_p::modulus();
167		mulmod_t pinv = zz_p::ModulusInverse();
168
169		vec_long::Watcher watch_mul_aux_vec(mul_aux_vec);
170		mul_aux_vec.SetLength(m);
171		long *acc = mul_aux_vec.elts();
172
173		long j, k;
174
175
176		const zz_p* ap = a.elts();
177
178		for (j = 0; j < m; j++) acc[j] = 0;
179
180		for (k = 0; k < l; k++) {
181		long aa = rep(ap[k]);
182		if (aa != 0) {
183		const zz_p* bp = B[k].elts();
184		long T1;
185		mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);
186
187		for (j = 0; j < m; j++) {
188		T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
189		acc[j] = AddMod(acc[j], T1, p);
190		}
191		}
192		}
193
194		x.SetLength(m);
195		zz_p *xp = x.elts();
196		for (j = 0; j < m; j++)
197		xp[j].LoopHole() = acc[j];
198		}
199		}
200
201
202		void mul_aux(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
203		{
204		long n = A.NumRows();
205		long l = A.NumCols();
206
207		if (l != b.length())
208		LogicError("matrix mul: dimension mismatch");
209
210		x.SetLength(n);
211		zz_p* xp = x.elts();
212
213		long p = zz_p::modulus();
214		mulmod_t pinv = zz_p::ModulusInverse();
215
216		long i, k;
217		long acc, tmp;
218
219		const zz_p* bp = b.elts();
220
221		if (n <= 1) {
222
223		for (i = 0; i < n; i++) {
224		acc = 0;
225		const zz_p* ap = A[i].elts();
226
227		for (k = 0; k < l; k++) {
228		tmp = MulMod(rep(ap[k]), rep(bp[k]), p, pinv);
229		acc = AddMod(acc, tmp, p);
230		}
231
232		xp[i].LoopHole() = acc;
233		}
234
235		}
236		else {
237
238		Vec<mulmod_precon_t>::Watcher watch_precon_vec(precon_vec);
239		precon_vec.SetLength(l);
240		mulmod_precon_t *bpinv = precon_vec.elts();
241
242		for (k = 0; k < l; k++)
243		bpinv[k] = PrepMulModPrecon(rep(bp[k]), p, pinv);
244
245		for (i = 0; i < n; i++) {
246		acc = 0;
247		const zz_p* ap = A[i].elts();
248
249		for (k = 0; k < l; k++) {
250		tmp = MulModPrecon(rep(ap[k]), rep(bp[k]), p, bpinv[k]);
251		acc = AddMod(acc, tmp, p);
252		}
253
254		xp[i].LoopHole() = acc;
255		}
256		}
257		}
258
259		void mul(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
260		{
261		if (&b == &x || A.position1(x) != -1) {
262		vec_zz_p tmp;
263		mul_aux(tmp, A, b);
264		x = tmp;
265		}
266		else
267		mul_aux(x, A, b);
268
269		}
270
271
272		void mul(mat_zz_p& X, const mat_zz_p& A, zz_p b)
	213	for (j = 1; j <= n; j++)
	214	if (i == j)
	215	X(i, j) = d;
	216	else
	217	clear(X(i, j));
	218	}
	219
	220	long IsDiag(const mat_zz_p& A, long n, zz_p d)
	221	{
	222	if (A.NumRows() != n || A.NumCols() != n)
	223	return 0;
	224
	225	long i, j;
	226
	227	for (i = 1; i <= n; i++)
	228	for (j = 1; j <= n; j++)
	229	if (i != j) {
	230	if (!IsZero(A(i, j))) return 0;
	231	}
	232	else {
	233	if (A(i, j) != d) return 0;
	234	}
	235
	236	return 1;
	237	}
	238
	239	void negate(mat_zz_p& X, const mat_zz_p& A)
273	240	{
274	241	long n = A.NumRows();
275	242	long m = A.NumCols();
276	243
	244
277	245	X.SetDims(n, m);
278	246
279		long i, j;
280
281		if (n == 0 || m == 0 || (n == 1 && m == 1)) {
282
283		for (i = 0; i < n; i++)
284		for (j = 0; j < m; j++)
285		mul(X[i][j], A[i][j], b);
286
287		}
288		else {
289
290		long p = zz_p::modulus();
291		mulmod_t pinv = zz_p::ModulusInverse();
292		long bb = rep(b);
293		mulmod_precon_t bpinv = PrepMulModPrecon(bb, p, pinv);
294
295		for (i = 0; i < n; i++) {
296		const zz_p *ap = A[i].elts();
297		zz_p *xp = X[i].elts();
298
299		for (j = 0; j < m; j++)
300		xp[j].LoopHole() = MulModPrecon(rep(ap[j]), bb, p, bpinv);
301		}
302
303		}
304		}
305
306		void mul(mat_zz_p& X, const mat_zz_p& A, long b_in)
307		{
308		zz_p b;
309		b = b_in;
310		mul(X, A, b);
311		}
312
313
314
315
316
	247	long p = zz_p::modulus();
	248
	249	for (long i = 0; i < n; i++) {
	250	zz_p *x = X[i].elts();
	251	const zz_p *a = A[i].elts();
	252	for (long j = 0; j < m; j++) {
	253	x[j].LoopHole() = NegateMod(rep(a[j]), p);
	254	}
	255	}
	256	}
	257
	258	long IsZero(const mat_zz_p& a)
	259	{
	260	long n = a.NumRows();
	261	long i;
	262
	263	for (i = 0; i < n; i++)
	264	if (!IsZero(a[i]))
	265	return 0;
	266
	267	return 1;
	268	}
	269
	270	void clear(mat_zz_p& x)
	271	{
	272	long n = x.NumRows();
	273	long i;
	274	for (i = 0; i < n; i++)
	275	clear(x[i]);
	276	}
	277
317	278
318	279	void ident(mat_zz_p& X, long n)
319	280	{

329	290	}
330	291
331	292
332
333		void determinant(zz_p& d, const mat_zz_p& M_in)
334		{
335		long k, n;
336		long i, j;
337		long pos;
338		zz_p t1, t2, t3;
339		zz_p *x, *y;
340
341		mat_zz_p M;
342		M = M_in;
343
344		n = M.NumRows();
345
346		if (M.NumCols() != n)
347		LogicError("determinant: nonsquare matrix");
348
349		if (n == 0) {
350		set(d);
351		return;
352		}
353
354		zz_p det;
355
356		set(det);
357
358		long p = zz_p::modulus();
359		mulmod_t pinv = zz_p::ModulusInverse();
360
361		for (k = 0; k < n; k++) {
362		pos = -1;
363		for (i = k; i < n; i++) {
364		if (!IsZero(M[i][k])) {
365		pos = i;
366		break;
367		}
368		}
369
370		if (pos != -1) {
371		if (k != pos) {
372		swap(M[pos], M[k]);
373		negate(det, det);
374		}
375
376		mul(det, det, M[k][k]);
377
378		inv(t3, M[k][k]);
379
380		for (i = k+1; i < n; i++) {
381		// M[i] = M[i] - M[k]*M[i,k]*t3
382
383		mul(t1, M[i][k], t3);
384		negate(t1, t1);
385
386		x = M[i].elts() + (k+1);
387		y = M[k].elts() + (k+1);
388
389		long T1 = rep(t1);
390		mulmod_precon_t t1pinv = PrepMulModPrecon(T1, p, pinv); // T1*pinv;
391		long T2;
392
393		for (j = k+1; j < n; j++, x++, y++) {
394		// x = x + (*y)*t1
395
396		T2 = MulModPrecon(rep(*y), T1, p, t1pinv);
397		x->LoopHole() = AddMod(rep(*x), T2, p);
398		}
399		}
400		}
401		else {
402		clear(d);
403		return;
404		}
405		}
406
407		d = det;
408		}
409
410
411
412
413	293	long IsIdent(const mat_zz_p& A, long n)
414	294	{
415	295	if (A.NumRows() != n || A.NumCols() != n)

461	341	}
462	342
463	343
464		void solve(zz_p& d, vec_zz_p& X,
465		const mat_zz_p& A, const vec_zz_p& b)
466
467		{
468		long n = A.NumRows();
469
470		if (A.NumCols() != n)
471		LogicError("solve: nonsquare matrix");
472
473
474		if (b.length() != n)
475		LogicError("solve: dimension mismatch");
476
477		if (n == 0) {
478		set(d);
479		X.SetLength(0);
480		return;
481		}
482
483		long i, j, k, pos;
484		zz_p t1, t2, t3;
485		zz_p *x, *y;
486
487		mat_zz_p M;
488		M.SetDims(n, n+1);
489		for (i = 0; i < n; i++) {
490		for (j = 0; j < n; j++)
491		M[i][j] = A[j][i];
492		M[i][n] = b[i];
493		}
494
495		zz_p det;
496		set(det);
497
498		long p = zz_p::modulus();
499		mulmod_t pinv = zz_p::ModulusInverse();
500
501		for (k = 0; k < n; k++) {
502		pos = -1;
503		for (i = k; i < n; i++) {
504		if (!IsZero(M[i][k])) {
505		pos = i;
506		break;
507		}
508		}
509
510		if (pos != -1) {
511		if (k != pos) {
512		swap(M[pos], M[k]);
513		negate(det, det);
514		}
515
516		mul(det, det, M[k][k]);
517
518		inv(t3, M[k][k]);
519		M[k][k] = t3;
520
521
522		for (i = k+1; i < n; i++) {
523		// M[i] = M[i] - M[k]*M[i,k]*t3
524
525		mul(t1, M[i][k], t3);
526		negate(t1, t1);
527
528		x = M[i].elts() + (k+1);
529		y = M[k].elts() + (k+1);
530
531		long T1 = rep(t1);
532		mulmod_precon_t t1pinv = PrepMulModPrecon(T1, p, pinv); // T1*pinv;
533		long T2;
534
535		for (j = k+1; j <= n; j++, x++, y++) {
536		// x = x + (*y)*t1
537
538		T2 = MulModPrecon(rep(*y), T1, p, t1pinv);
539		x->LoopHole() = AddMod(rep(*x), T2, p);
540		}
541		}
542		}
543		else {
544		clear(d);
545		return;
546		}
547		}
548
549		X.SetLength(n);
550		for (i = n-1; i >= 0; i--) {
551		clear(t1);
552		for (j = i+1; j < n; j++) {
553		mul(t2, X[j], M[i][j]);
554		add(t1, t1, t2);
555		}
556		sub(t1, M[i][n], t1);
557		mul(X[i], t1, M[i][i]);
558		}
559
560		d = det;
561		}
562
563		void inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A)
564		{
565		long n = A.NumRows();
566		if (A.NumCols() != n)
567		LogicError("inv: nonsquare matrix");
568
569		if (n == 0) {
570		set(d);
571		X.SetDims(0, 0);
572		return;
573		}
574
575		long i, j, k, pos;
576		zz_p t1, t2, t3;
577		zz_p *x, *y;
578
579		mat_zz_p M;
580		M.SetDims(n, 2*n);
581		for (i = 0; i < n; i++) {
582		for (j = 0; j < n; j++) {
583		M[i][j] = A[i][j];
584		clear(M[i][n+j]);
585		}
586		set(M[i][n+i]);
587		}
588
589		zz_p det;
590		set(det);
591
592		long p = zz_p::modulus();
593		mulmod_t pinv = zz_p::ModulusInverse();
594
595		for (k = 0; k < n; k++) {
596		pos = -1;
597		for (i = k; i < n; i++) {
598		if (!IsZero(M[i][k])) {
599		pos = i;
600		break;
601		}
602		}
603
604		if (pos != -1) {
605		if (k != pos) {
606		swap(M[pos], M[k]);
607		negate(det, det);
608		}
609
610		mul(det, det, M[k][k]);
611
612		inv(t3, M[k][k]);
613		M[k][k] = t3;
614
615		for (i = k+1; i < n; i++) {
616		// M[i] = M[i] - M[k]*M[i,k]*t3
617
618		mul(t1, M[i][k], t3);
619		negate(t1, t1);
620
621		x = M[i].elts() + (k+1);
622		y = M[k].elts() + (k+1);
623
624		long T1 = rep(t1);
625		mulmod_precon_t t1pinv = PrepMulModPrecon(T1, p, pinv); // T1*pinv;
626		long T2;
627
628		for (j = k+1; j < 2*n; j++, x++, y++) {
629		// x = x + (*y)*t1
630
631		T2 = MulModPrecon(rep(*y), T1, p, t1pinv);
632		x->LoopHole() = AddMod(rep(*x), T2, p);
633		}
634		}
635		}
636		else {
637		clear(d);
638		return;
639		}
640		}
641
642		X.SetDims(n, n);
643		for (k = 0; k < n; k++) {
644		for (i = n-1; i >= 0; i--) {
645		clear(t1);
646		for (j = i+1; j < n; j++) {
647		mul(t2, X[j][k], M[i][j]);
648		add(t1, t1, t2);
649		}
650		sub(t1, M[i][n+k], t1);
651		mul(X[i][k], t1, M[i][i]);
652		}
653		}
654
655		d = det;
656		}
657
658		long gauss(mat_zz_p& M, long w)
659		{
660		long k, l;
661		long i, j;
662		long pos;
663		zz_p t1, t2, t3;
664		zz_p *x, *y;
665
666		long n = M.NumRows();
667		long m = M.NumCols();
668
669		if (w < 0 || w > m)
670		LogicError("gauss: bad args");
671
672		long p = zz_p::modulus();
673		mulmod_t pinv = zz_p::ModulusInverse();
674		long T1, T2;
675
676		l = 0;
677		for (k = 0; k < w && l < n; k++) {
678
679		pos = -1;
680		for (i = l; i < n; i++) {
681		if (!IsZero(M[i][k])) {
682		pos = i;
683		break;
684		}
685		}
686
687		if (pos != -1) {
688		swap(M[pos], M[l]);
689
690		inv(t3, M[l][k]);
691		negate(t3, t3);
692
693		for (i = l+1; i < n; i++) {
694		// M[i] = M[i] + M[l]*M[i,k]*t3
695
696		mul(t1, M[i][k], t3);
697
698		T1 = rep(t1);
699		mulmod_precon_t T1pinv = PrepMulModPrecon(T1, p, pinv);
700
701		clear(M[i][k]);
702
703		x = M[i].elts() + (k+1);
704		y = M[l].elts() + (k+1);
705
706		for (j = k+1; j < m; j++, x++, y++) {
707		// x = x + (*y)*t1
708
709		T2 = MulModPrecon(rep(*y), T1, p, T1pinv);
710		T2 = AddMod(T2, rep(*x), p);
711		(*x).LoopHole() = T2;
712		}
713		}
714
715		l++;
716		}
717		}
718
719		return l;
720		}
721
722		long gauss(mat_zz_p& M)
723		{
724		return gauss(M, M.NumCols());
725		}
726
727		void image(mat_zz_p& X, const mat_zz_p& A)
728		{
729		mat_zz_p M;
730		M = A;
731		long r = gauss(M);
732		M.SetDims(r, M.NumCols());
733		X = M;
734		}
735
736		void kernel(mat_zz_p& X, const mat_zz_p& A)
737		{
738		long m = A.NumRows();
739		long n = A.NumCols();
740
741		mat_zz_p M;
742		long r;
743
744		transpose(M, A);
745		r = gauss(M);
746
747		X.SetDims(m-r, m);
748
749		long i, j, k, s;
750		zz_p t1, t2;
751
752		vec_long D;
753		D.SetLength(m);
754		for (j = 0; j < m; j++) D[j] = -1;
755
756		vec_zz_p inverses;
757		inverses.SetLength(m);
758
759		j = -1;
760		for (i = 0; i < r; i++) {
761		do {
762		j++;
763		} while (IsZero(M[i][j]));
764
765		D[j] = i;
766		inv(inverses[j], M[i][j]);
767		}
768
769		for (k = 0; k < m-r; k++) {
770		vec_zz_p& v = X[k];
771		long pos = 0;
772		for (j = m-1; j >= 0; j--) {
773		if (D[j] == -1) {
774		if (pos == k)
775		set(v[j]);
776		else
777		clear(v[j]);
778		pos++;
779		}
780		else {
781		i = D[j];
782
783		clear(t1);
784
785		for (s = j+1; s < m; s++) {
786		mul(t2, v[s], M[i][s]);
787		add(t1, t1, t2);
788		}
789
790		mul(t1, t1, inverses[j]);
791		negate(v[j], t1);
792		}
793		}
794		}
795		}
796
797
798
799
800
801		void diag(mat_zz_p& X, long n, zz_p d)
802		{
803		X.SetDims(n, n);
804		long i, j;
805
806		for (i = 1; i <= n; i++)
807		for (j = 1; j <= n; j++)
808		if (i == j)
809		X(i, j) = d;
810		else
811		clear(X(i, j));
812		}
813
814		long IsDiag(const mat_zz_p& A, long n, zz_p d)
815		{
816		if (A.NumRows() != n || A.NumCols() != n)
817		return 0;
818
819		long i, j;
820
821		for (i = 1; i <= n; i++)
822		for (j = 1; j <= n; j++)
823		if (i != j) {
824		if (!IsZero(A(i, j))) return 0;
825		}
826		else {
827		if (A(i, j) != d) return 0;
828		}
829
830		return 1;
831		}
832
833		void negate(mat_zz_p& X, const mat_zz_p& A)
834		{
835		long n = A.NumRows();
836		long m = A.NumCols();
837
838
839		X.SetDims(n, m);
840
841		long i, j;
842		for (i = 1; i <= n; i++)
843		for (j = 1; j <= m; j++)
844		negate(X(i,j), A(i,j));
845		}
846
847		long IsZero(const mat_zz_p& a)
848		{
849		long n = a.NumRows();
850		long i;
851
852		for (i = 0; i < n; i++)
853		if (!IsZero(a[i]))
854		return 0;
855
856		return 1;
857		}
858
859		void clear(mat_zz_p& x)
860		{
861		long n = x.NumRows();
862		long i;
863		for (i = 0; i < n; i++)
864		clear(x[i]);
865		}
866
867
868		mat_zz_p operator+(const mat_zz_p& a, const mat_zz_p& b)
869		{
870		mat_zz_p res;
871		add(res, a, b);
872		NTL_OPT_RETURN(mat_zz_p, res);
873		}
874
875		mat_zz_p operator*(const mat_zz_p& a, const mat_zz_p& b)
876		{
877		mat_zz_p res;
878		mul_aux(res, a, b);
879		NTL_OPT_RETURN(mat_zz_p, res);
880		}
881
882		mat_zz_p operator-(const mat_zz_p& a, const mat_zz_p& b)
883		{
884		mat_zz_p res;
885		sub(res, a, b);
886		NTL_OPT_RETURN(mat_zz_p, res);
887		}
888
889
890		mat_zz_p operator-(const mat_zz_p& a)
891		{
892		mat_zz_p res;
893		negate(res, a);
894		NTL_OPT_RETURN(mat_zz_p, res);
895		}
896
897
898		vec_zz_p operator*(const mat_zz_p& a, const vec_zz_p& b)
899		{
900		vec_zz_p res;
901		mul_aux(res, a, b);
902		NTL_OPT_RETURN(vec_zz_p, res);
903		}
904
905		vec_zz_p operator*(const vec_zz_p& a, const mat_zz_p& b)
906		{
907		vec_zz_p res;
908		mul(res, a, b);
909		NTL_OPT_RETURN(vec_zz_p, res);
910		}
911
912		void inv(mat_zz_p& X, const mat_zz_p& A)
913		{
914		zz_p d;
915		inv(d, X, A);
916		if (d == 0) ArithmeticError("inv: non-invertible matrix");
917		}
918
919		void power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e)
	344
	345
	346	void relaxed_power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e, bool relax)
920	347	{
921	348	if (A.NumRows() != A.NumCols()) LogicError("power: non-square matrix");
922	349

940	367	}
941	368
942	369	if (e < 0)
943		inv(X, T1);
	370	relaxed_inv(X, T1, relax);
944	371	else
945	372	X = T1;
946	373	}
947	374
	375
	376
	377	// ******************************************************************
	378	//
	379	// matrix-vector multiplication code
	380	//
	381	// ******************************************************************
	382
	383
	384
	385
	386
	387
	388	void mul(vec_zz_p& x, const vec_zz_p& a, const mat_zz_p& B)
	389	{
	390	long l = a.length();
	391	long m = B.NumCols();
	392
	393	if (l != B.NumRows())
	394	LogicError("matrix mul: dimension mismatch");
	395
	396	if (m == 0) {
	397
	398	x.SetLength(0);
	399
	400	}
	401	else if (m == 1) {
	402
	403	long p = zz_p::modulus();
	404	mulmod_t pinv = zz_p::ModulusInverse();
	405
	406	long acc, tmp;
	407	long k;
	408
	409	acc = 0;
	410	for(k = 1; k <= l; k++) {
	411	tmp = MulMod(rep(a(k)), rep(B(k,1)), p, pinv);
	412	acc = AddMod(acc, tmp, p);
	413	}
	414
	415	x.SetLength(1);
	416	x(1).LoopHole() = acc;
	417
	418	}
	419	else { // m > 1. precondition and EXEC_RANGE
	420
	421
	422	long p = zz_p::modulus();
	423	mulmod_t pinv = zz_p::ModulusInverse();
	424
	425	NTL_TLS_LOCAL(vec_long, mul_aux_vec);
	426	vec_long::Watcher watch_mul_aux_vec(mul_aux_vec);
	427	mul_aux_vec.SetLength(m);
	428	long *acc = mul_aux_vec.elts();
	429
	430	const zz_p* ap = a.elts();
	431
	432	for (long j = 0; j < m; j++) acc[j] = 0;
	433
	434	const bool seq = double(l)*double(m) < PAR_THRESH;
	435
	436	NTL_GEXEC_RANGE(seq, m, first, last) {
	437
	438	for (long k = 0; k < l; k++) {
	439	long aa = rep(ap[k]);
	440	if (aa != 0) {
	441	const zz_p* bp = B[k].elts();
	442	long T1;
	443	mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);
	444
	445	for (long j = first; j < last; j++) {
	446	T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
	447	acc[j] = AddMod(acc[j], T1, p);
	448	}
	449	}
	450	}
	451
	452	} NTL_GEXEC_RANGE_END
	453
	454	x.SetLength(m);
	455	zz_p *xp = x.elts();
	456	for (long j = 0; j < m; j++)
	457	xp[j].LoopHole() = acc[j];
	458	}
	459	}
	460
	461
	462	void mul_aux(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
	463	{
	464	long n = A.NumRows();
	465	long l = A.NumCols();
	466
	467	if (l != b.length())
	468	LogicError("matrix mul: dimension mismatch");
	469
	470	x.SetLength(n);
	471	zz_p* xp = x.elts();
	472
	473	long p = zz_p::modulus();
	474	const zz_p* bp = b.elts();
	475
	476	const bool seq = double(n)*double(l) < PAR_THRESH;
	477
	478
	479	#ifdef NTL_HAVE_LL_TYPE
	480
	481	if (cast_unsigned(l) <= (~(0UL))/cast_unsigned(p-1) &&
	482	cast_unsigned(l)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) {
	483
	484	sp_reduce_struct red_struct = zz_p::red_struct();
	485
	486	NTL_GEXEC_RANGE(seq, n, first, last) {
	487
	488	for (long i = first; i < last; i++) {
	489	xp[i].LoopHole() = InnerProd_L(A[i].elts(), bp, l, p, red_struct);
	490	}
	491
	492	} NTL_GEXEC_RANGE_END
	493	}
	494	else {
	495	sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
	496
	497	NTL_GEXEC_RANGE(seq, n, first, last) {
	498
	499	for (long i = first; i < last; i++) {
	500	xp[i].LoopHole() = InnerProd_LL(A[i].elts(), bp, l, p, ll_red_struct);
	501	}
	502
	503	} NTL_GEXEC_RANGE_END
	504
	505	}
	506
	507	#else
	508
	509	mulmod_t pinv = zz_p::ModulusInverse();
	510
	511	if (n <= 1) {
	512
	513	for (long i = 0; i < n; i++) {
	514	long acc = 0;
	515	const zz_p* ap = A[i].elts();
	516
	517	for (long k = 0; k < l; k++) {
	518	long tmp = MulMod(rep(ap[k]), rep(bp[k]), p, pinv);
	519	acc = AddMod(acc, tmp, p);
	520	}
	521
	522	xp[i].LoopHole() = acc;
	523	}
	524
	525	}
	526	else {
	527
	528	NTL_TLS_LOCAL(Vec<mulmod_precon_t>, precon_vec);
	529	Vec<mulmod_precon_t>::Watcher watch_precon_vec(precon_vec);
	530	precon_vec.SetLength(l);
	531	mulmod_precon_t *bpinv = precon_vec.elts();
	532
	533	for (long k = 0; k < l; k++)
	534	bpinv[k] = PrepMulModPrecon(rep(bp[k]), p, pinv);
	535
	536
	537	NTL_GEXEC_RANGE(seq, n, first, last) {
	538	for (long i = first; i < last; i++) {
	539	long acc = 0;
	540	const zz_p* ap = A[i].elts();
	541
	542	for (long k = 0; k < l; k++) {
	543	long tmp = MulModPrecon(rep(ap[k]), rep(bp[k]), p, bpinv[k]);
	544	acc = AddMod(acc, tmp, p);
	545	}
	546
	547	xp[i].LoopHole() = acc;
	548	}
	549	} NTL_GEXEC_RANGE_END
	550
	551	}
	552
	553	#endif
	554	}
	555
	556	void mul(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
	557	{
	558	if (&b == &x || A.position1(x) != -1) {
	559	vec_zz_p tmp;
	560	mul_aux(tmp, A, b);
	561	x = tmp;
	562	}
	563	else
	564	mul_aux(x, A, b);
	565
	566	}
	567
	568
	569	void mul(mat_zz_p& X, const mat_zz_p& A, zz_p b)
	570	{
	571	long n = A.NumRows();
	572	long m = A.NumCols();
	573
	574	X.SetDims(n, m);
	575
	576
	577	if (n == 0 || m == 0 || (n == 1 && m == 1)) {
	578	long i, j;
	579
	580	for (i = 0; i < n; i++)
	581	for (j = 0; j < m; j++)
	582	mul(X[i][j], A[i][j], b);
	583
	584	}
	585	else {
	586
	587	long p = zz_p::modulus();
	588	mulmod_t pinv = zz_p::ModulusInverse();
	589	long bb = rep(b);
	590	mulmod_precon_t bpinv = PrepMulModPrecon(bb, p, pinv);
	591
	592	const bool seq = double(n)*double(m) < PAR_THRESH;
	593
	594	NTL_GEXEC_RANGE(seq, n, first, last)
	595	long i, j;
	596	for (i = first; i < last; i++) {
	597	const zz_p *ap = A[i].elts();
	598	zz_p *xp = X[i].elts();
	599
	600	for (j = 0; j < m; j++)
	601	xp[j].LoopHole() = MulModPrecon(rep(ap[j]), bb, p, bpinv);
	602	}
	603	NTL_GEXEC_RANGE_END
	604
	605
	606	}
	607	}
	608
	609	void mul(mat_zz_p& X, const mat_zz_p& A, long b_in)
	610	{
	611	zz_p b;
	612	b = b_in;
	613	mul(X, A, b);
	614	}
	615
	616
	617	// ******************************************************************
	618	//
	619	// Code shared by block-matrix code
	620	//
	621	// ******************************************************************
	622
	623	#define MAT_BLK_SZ (32)
	624
	625
	626	#ifdef NTL_HAVE_LL_TYPE
	627
	628	#ifdef NTL_HAVE_AVX
	629
	630	#define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1)
	631	// max int representable exactly as a double
	632	// this assumes NTL_DBL_PRECISION <= NTL_BITS_PER_LONG-2, which is
	633	// checked in the code that tests for HAVE_AVX, but we check it here as
	634	// well
	635
	636	#if (NTL_DBL_PRECISION > NTL_BITS_PER_LONG-2)
	637	#error "NTL_DBL_PRECISION > NTL_BITS_PER_LONG-2"
	638	#endif
	639
	640
	641	// MUL_ADD(a, b, c): a += b*c
	642	#ifdef NTL_HAVE_FMA
	643	#define MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a)
	644	#else
	645	#define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c))
	646	#endif
	647
	648	#if 0
	649	static
	650	void muladd1_by_32(double *x, const double *a, const double *b, long n)
	651	{
	652	__m256d avec, bvec;
	653
	654
	655	__m256d acc0=_mm256_load_pd(x + 0*4);
	656	__m256d acc1=_mm256_load_pd(x + 1*4);
	657	__m256d acc2=_mm256_load_pd(x + 2*4);
	658	__m256d acc3=_mm256_load_pd(x + 3*4);
	659	__m256d acc4=_mm256_load_pd(x + 4*4);
	660	__m256d acc5=_mm256_load_pd(x + 5*4);
	661	__m256d acc6=_mm256_load_pd(x + 6*4);
	662	__m256d acc7=_mm256_load_pd(x + 7*4);
	663
	664
	665	for (long i = 0; i < n; i++) {
	666	avec = _mm256_broadcast_sd(a); a++;
	667
	668
	669	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
	670	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
	671	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec);
	672	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec);
	673	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec, bvec);
	674	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec, bvec);
	675	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec, bvec);
	676	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec, bvec);
	677	}
	678
	679
	680	_mm256_store_pd(x + 0*4, acc0);
	681	_mm256_store_pd(x + 1*4, acc1);
	682	_mm256_store_pd(x + 2*4, acc2);
	683	_mm256_store_pd(x + 3*4, acc3);
	684	_mm256_store_pd(x + 4*4, acc4);
	685	_mm256_store_pd(x + 5*4, acc5);
	686	_mm256_store_pd(x + 6*4, acc6);
	687	_mm256_store_pd(x + 7*4, acc7);
	688	}
	689
	690	#else
	691
	692	static
	693	void muladd1_by_32(double *x, const double *a, const double *b, long n)
	694	{
	695	__m256d acc0=_mm256_load_pd(x + 0*4);
	696	__m256d acc1=_mm256_load_pd(x + 1*4);
	697	__m256d acc2=_mm256_load_pd(x + 2*4);
	698	__m256d acc3=_mm256_load_pd(x + 3*4);
	699	__m256d acc4=_mm256_load_pd(x + 4*4);
	700	__m256d acc5=_mm256_load_pd(x + 5*4);
	701	__m256d acc6=_mm256_load_pd(x + 6*4);
	702	__m256d acc7=_mm256_load_pd(x + 7*4);
	703
	704	long i = 0;
	705	for (; i <= n-4; i +=4) {
	706
	707	// the following code sequences are a bit faster than
	708	// just doing 4 _mm256_broadcast_sd's
	709	// it requires a to point to aligned storage, however
	710
	711	#if 1
	712	// this one seems slightly faster
	713	__m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0));
	714	__m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2));
	715	#else
	716	__m256d avec = _mm256_load_pd(a);
	717	__m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0);
	718	__m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11);
	719
	720	#endif
	721
	722	__m256d avec0 = _mm256_permute_pd(a0101, 0);
	723	__m256d avec1 = _mm256_permute_pd(a0101, 0xf);
	724	__m256d avec2 = _mm256_permute_pd(a2323, 0);
	725	__m256d avec3 = _mm256_permute_pd(a2323, 0xf);
	726
	727	a += 4;
	728
	729	__m256d bvec;
	730
	731	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec0, bvec);
	732	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec0, bvec);
	733	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec0, bvec);
	734	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec0, bvec);
	735	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec0, bvec);
	736	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec0, bvec);
	737	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec0, bvec);
	738	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec0, bvec);
	739
	740	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec1, bvec);
	741	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec1, bvec);
	742	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec1, bvec);
	743	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec1, bvec);
	744	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec1, bvec);
	745	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec1, bvec);
	746	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec1, bvec);
	747	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec1, bvec);
	748
	749	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec2, bvec);
	750	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec2, bvec);
	751	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec2, bvec);
	752	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec2, bvec);
	753	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec2, bvec);
	754	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec2, bvec);
	755	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec2, bvec);
	756	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec2, bvec);
	757
	758	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec3, bvec);
	759	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec3, bvec);
	760	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec3, bvec);
	761	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec3, bvec);
	762	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec3, bvec);
	763	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec3, bvec);
	764	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec3, bvec);
	765	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec3, bvec);
	766	}
	767
	768	for (; i < n; i++) {
	769	__m256d avec = _mm256_broadcast_sd(a); a++;
	770	__m256d bvec;
	771
	772	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
	773	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
	774	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec);
	775	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec);
	776	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec, bvec);
	777	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec, bvec);
	778	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec, bvec);
	779	bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec, bvec);
	780	}
	781
	782
	783	_mm256_store_pd(x + 0*4, acc0);
	784	_mm256_store_pd(x + 1*4, acc1);
	785	_mm256_store_pd(x + 2*4, acc2);
	786	_mm256_store_pd(x + 3*4, acc3);
	787	_mm256_store_pd(x + 4*4, acc4);
	788	_mm256_store_pd(x + 5*4, acc5);
	789	_mm256_store_pd(x + 6*4, acc6);
	790	_mm256_store_pd(x + 7*4, acc7);
	791	}
	792
	793	#endif
	794
	795	// experiment: process two rows at a time
	796	#if 1
	797	static
	798	void muladd2_by_32(double *x, const double *a, const double *b, long n)
	799	{
	800	__m256d avec0, avec1, bvec;
	801	__m256d acc00, acc01, acc02, acc03;
	802	__m256d acc10, acc11, acc12, acc13;
	803
	804
	805	// round 0
	806
	807	acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
	808	acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
	809	acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
	810	acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
	811
	812	acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
	813	acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
	814	acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
	815	acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
	816
	817	for (long i = 0; i < n; i++) {
	818	avec0 = _mm256_broadcast_sd(&a[i]);
	819	avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
	820
	821	bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
	822	bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
	823	bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
	824	bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
	825	}
	826
	827
	828	_mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
	829	_mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
	830	_mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
	831	_mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
	832
	833	_mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
	834	_mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
	835	_mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
	836	_mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
	837
	838	// round 1
	839
	840	acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
	841	acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
	842	acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
	843	acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
	844
	845	acc10=_mm256_load_pd(x + 44 + 1MAT_BLK_SZ);
	846	acc11=_mm256_load_pd(x + 54 + 1MAT_BLK_SZ);
	847	acc12=_mm256_load_pd(x + 64 + 1MAT_BLK_SZ);
	848	acc13=_mm256_load_pd(x + 74 + 1MAT_BLK_SZ);
	849
	850	for (long i = 0; i < n; i++) {
	851	avec0 = _mm256_broadcast_sd(&a[i]);
	852	avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
	853
	854	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+04+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
	855	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+14+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
	856	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+24+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
	857	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+34+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
	858	}
	859
	860
	861	_mm256_store_pd(x + 44 + 0MAT_BLK_SZ, acc00);
	862	_mm256_store_pd(x + 54 + 0MAT_BLK_SZ, acc01);
	863	_mm256_store_pd(x + 64 + 0MAT_BLK_SZ, acc02);
	864	_mm256_store_pd(x + 74 + 0MAT_BLK_SZ, acc03);
	865
	866	_mm256_store_pd(x + 44 + 1MAT_BLK_SZ, acc10);
	867	_mm256_store_pd(x + 54 + 1MAT_BLK_SZ, acc11);
	868	_mm256_store_pd(x + 64 + 1MAT_BLK_SZ, acc12);
	869	_mm256_store_pd(x + 74 + 1MAT_BLK_SZ, acc13);
	870
	871	}
	872
	873	#else
	874
	875	static
	876	void muladd2_by_32(double x, const double a, const double *b, long n)
	877	{
	878	long i, j;
	879	__m256d bvec;
	880	__m256d acc00, acc01, acc02, acc03;
	881	__m256d acc10, acc11, acc12, acc13;
	882
	883
	884	for (j = 0; j < 2; j++) {
	885
	886	acc00=_mm256_load_pd(x + 04 + 0MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
	887	acc01=_mm256_load_pd(x + 14 + 0MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
	888	acc02=_mm256_load_pd(x + 24 + 0MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
	889	acc03=_mm256_load_pd(x + 34 + 0MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
	890
	891	acc10=_mm256_load_pd(x + 04 + 1MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
	892	acc11=_mm256_load_pd(x + 14 + 1MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
	893	acc12=_mm256_load_pd(x + 24 + 1MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
	894	acc13=_mm256_load_pd(x + 34 + 1MAT_BLK_SZ + j*(MAT_BLK_SZ/2));
	895
	896	for (i = 0; i <= n-4; i+=4) {
	897	__m256d a0_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0));
	898	__m256d a0_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2));
	899	__m256d avec00 = _mm256_permute_pd(a0_0101, 0);
	900	__m256d avec01 = _mm256_permute_pd(a0_0101, 0xf);
	901	__m256d avec02 = _mm256_permute_pd(a0_2323, 0);
	902	__m256d avec03 = _mm256_permute_pd(a0_2323, 0xf);
	903
	904	__m256d a1_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0+MAT_BLK_SZ));
	905	__m256d a1_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2+MAT_BLK_SZ));
	906	__m256d avec10 = _mm256_permute_pd(a1_0101, 0);
	907	__m256d avec11 = _mm256_permute_pd(a1_0101, 0xf);
	908	__m256d avec12 = _mm256_permute_pd(a1_2323, 0);
	909	__m256d avec13 = _mm256_permute_pd(a1_2323, 0xf);
	910
	911	bvec = _mm256_load_pd(&b[(i+0)MAT_BLK_SZ+04+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec00, bvec); MUL_ADD(acc10, avec10, bvec);
	912	bvec = _mm256_load_pd(&b[(i+0)MAT_BLK_SZ+14+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec00, bvec); MUL_ADD(acc11, avec10, bvec);
	913	bvec = _mm256_load_pd(&b[(i+0)MAT_BLK_SZ+24+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec00, bvec); MUL_ADD(acc12, avec10, bvec);
	914	bvec = _mm256_load_pd(&b[(i+0)MAT_BLK_SZ+34+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec00, bvec); MUL_ADD(acc13, avec10, bvec);
	915
	916	bvec = _mm256_load_pd(&b[(i+1)MAT_BLK_SZ+04+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec01, bvec); MUL_ADD(acc10, avec11, bvec);
	917	bvec = _mm256_load_pd(&b[(i+1)MAT_BLK_SZ+14+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec01, bvec); MUL_ADD(acc11, avec11, bvec);
	918	bvec = _mm256_load_pd(&b[(i+1)MAT_BLK_SZ+24+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec01, bvec); MUL_ADD(acc12, avec11, bvec);
	919	bvec = _mm256_load_pd(&b[(i+1)MAT_BLK_SZ+34+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec01, bvec); MUL_ADD(acc13, avec11, bvec);
	920
	921	bvec = _mm256_load_pd(&b[(i+2)MAT_BLK_SZ+04+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec02, bvec); MUL_ADD(acc10, avec12, bvec);
	922	bvec = _mm256_load_pd(&b[(i+2)MAT_BLK_SZ+14+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec02, bvec); MUL_ADD(acc11, avec12, bvec);
	923	bvec = _mm256_load_pd(&b[(i+2)MAT_BLK_SZ+24+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec02, bvec); MUL_ADD(acc12, avec12, bvec);
	924	bvec = _mm256_load_pd(&b[(i+2)MAT_BLK_SZ+34+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec02, bvec); MUL_ADD(acc13, avec12, bvec);
	925
	926	bvec = _mm256_load_pd(&b[(i+3)MAT_BLK_SZ+04+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec03, bvec); MUL_ADD(acc10, avec13, bvec);
	927	bvec = _mm256_load_pd(&b[(i+3)MAT_BLK_SZ+14+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec03, bvec); MUL_ADD(acc11, avec13, bvec);
	928	bvec = _mm256_load_pd(&b[(i+3)MAT_BLK_SZ+24+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec03, bvec); MUL_ADD(acc12, avec13, bvec);
	929	bvec = _mm256_load_pd(&b[(i+3)MAT_BLK_SZ+34+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec03, bvec); MUL_ADD(acc13, avec13, bvec);
	930	}
	931
	932	for (; i < n; i++) {
	933	__m256d avec0 = _mm256_broadcast_sd(&a[i]);
	934	__m256d avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
	935
	936	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+04+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
	937	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+14+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
	938	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+24+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
	939	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+34+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
	940	}
	941
	942
	943	_mm256_store_pd(x + 04 + 0MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc00);
	944	_mm256_store_pd(x + 14 + 0MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc01);
	945	_mm256_store_pd(x + 24 + 0MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc02);
	946	_mm256_store_pd(x + 34 + 0MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc03);
	947
	948	_mm256_store_pd(x + 04 + 1MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc10);
	949	_mm256_store_pd(x + 14 + 1MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc11);
	950	_mm256_store_pd(x + 24 + 1MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc12);
	951	_mm256_store_pd(x + 34 + 1MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc13);
	952
	953	}
	954	}
	955	#endif
	956
	957
	958
	959	// experiment: process three rows at a time
	960	// NOTE: this makes things slower on an AVX1 platform --- not enough registers
	961	// it could be faster on AVX2/FMA, where there should be enough registers
	962
	963	static
	964	void muladd3_by_32(double x, const double a, const double *b, long n)
	965	{
	966	__m256d avec0, avec1, avec2, bvec;
	967	__m256d acc00, acc01, acc02, acc03;
	968	__m256d acc10, acc11, acc12, acc13;
	969	__m256d acc20, acc21, acc22, acc23;
	970
	971
	972	// round 0
	973
	974	acc00=_mm256_load_pd(x + 04 + 0MAT_BLK_SZ);
	975	acc01=_mm256_load_pd(x + 14 + 0MAT_BLK_SZ);
	976	acc02=_mm256_load_pd(x + 24 + 0MAT_BLK_SZ);
	977	acc03=_mm256_load_pd(x + 34 + 0MAT_BLK_SZ);
	978
	979	acc10=_mm256_load_pd(x + 04 + 1MAT_BLK_SZ);
	980	acc11=_mm256_load_pd(x + 14 + 1MAT_BLK_SZ);
	981	acc12=_mm256_load_pd(x + 24 + 1MAT_BLK_SZ);
	982	acc13=_mm256_load_pd(x + 34 + 1MAT_BLK_SZ);
	983
	984	acc20=_mm256_load_pd(x + 04 + 2MAT_BLK_SZ);
	985	acc21=_mm256_load_pd(x + 14 + 2MAT_BLK_SZ);
	986	acc22=_mm256_load_pd(x + 24 + 2MAT_BLK_SZ);
	987	acc23=_mm256_load_pd(x + 34 + 2MAT_BLK_SZ);
	988
	989	for (long i = 0; i < n; i++) {
	990	avec0 = _mm256_broadcast_sd(&a[i]);
	991	avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
	992	avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
	993
	994	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+04]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
	995	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+14]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
	996	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+24]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
	997	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+34]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
	998	}
	999
	1000
	1001	_mm256_store_pd(x + 04 + 0MAT_BLK_SZ, acc00);
	1002	_mm256_store_pd(x + 14 + 0MAT_BLK_SZ, acc01);
	1003	_mm256_store_pd(x + 24 + 0MAT_BLK_SZ, acc02);
	1004	_mm256_store_pd(x + 34 + 0MAT_BLK_SZ, acc03);
	1005
	1006	_mm256_store_pd(x + 04 + 1MAT_BLK_SZ, acc10);
	1007	_mm256_store_pd(x + 14 + 1MAT_BLK_SZ, acc11);
	1008	_mm256_store_pd(x + 24 + 1MAT_BLK_SZ, acc12);
	1009	_mm256_store_pd(x + 34 + 1MAT_BLK_SZ, acc13);
	1010
	1011	_mm256_store_pd(x + 04 + 2MAT_BLK_SZ, acc20);
	1012	_mm256_store_pd(x + 14 + 2MAT_BLK_SZ, acc21);
	1013	_mm256_store_pd(x + 24 + 2MAT_BLK_SZ, acc22);
	1014	_mm256_store_pd(x + 34 + 2MAT_BLK_SZ, acc23);
	1015
	1016	// round 1
	1017
	1018	acc00=_mm256_load_pd(x + 44 + 0MAT_BLK_SZ);
	1019	acc01=_mm256_load_pd(x + 54 + 0MAT_BLK_SZ);
	1020	acc02=_mm256_load_pd(x + 64 + 0MAT_BLK_SZ);
	1021	acc03=_mm256_load_pd(x + 74 + 0MAT_BLK_SZ);
	1022
	1023	acc10=_mm256_load_pd(x + 44 + 1MAT_BLK_SZ);
	1024	acc11=_mm256_load_pd(x + 54 + 1MAT_BLK_SZ);
	1025	acc12=_mm256_load_pd(x + 64 + 1MAT_BLK_SZ);
	1026	acc13=_mm256_load_pd(x + 74 + 1MAT_BLK_SZ);
	1027
	1028	acc20=_mm256_load_pd(x + 44 + 2MAT_BLK_SZ);
	1029	acc21=_mm256_load_pd(x + 54 + 2MAT_BLK_SZ);
	1030	acc22=_mm256_load_pd(x + 64 + 2MAT_BLK_SZ);
	1031	acc23=_mm256_load_pd(x + 74 + 2MAT_BLK_SZ);
	1032
	1033	for (long i = 0; i < n; i++) {
	1034	avec0 = _mm256_broadcast_sd(&a[i]);
	1035	avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
	1036	avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);
	1037
	1038	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+04+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
	1039	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+14+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
	1040	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+24+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
	1041	bvec = _mm256_load_pd(&b[iMAT_BLK_SZ+34+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
	1042	}
	1043
	1044
	1045	_mm256_store_pd(x + 44 + 0MAT_BLK_SZ, acc00);
	1046	_mm256_store_pd(x + 54 + 0MAT_BLK_SZ, acc01);
	1047	_mm256_store_pd(x + 64 + 0MAT_BLK_SZ, acc02);
	1048	_mm256_store_pd(x + 74 + 0MAT_BLK_SZ, acc03);
	1049
	1050	_mm256_store_pd(x + 44 + 1MAT_BLK_SZ, acc10);
	1051	_mm256_store_pd(x + 54 + 1MAT_BLK_SZ, acc11);
	1052	_mm256_store_pd(x + 64 + 1MAT_BLK_SZ, acc12);
	1053	_mm256_store_pd(x + 74 + 1MAT_BLK_SZ, acc13);
	1054
	1055	_mm256_store_pd(x + 44 + 2MAT_BLK_SZ, acc20);
	1056	_mm256_store_pd(x + 54 + 2MAT_BLK_SZ, acc21);
	1057	_mm256_store_pd(x + 64 + 2MAT_BLK_SZ, acc22);
	1058	_mm256_store_pd(x + 74 + 2MAT_BLK_SZ, acc23);
	1059
	1060	}
	1061
	1062	static inline
	1063	void muladd_all_by_32(long first, long last, double x, const double a, const double *b, long n)
	1064	{
	1065	long i = first;
	1066	#ifdef NTL_HAVE_FMA
	1067	// processing three rows at a time is faster
	1068	for (; i <= last-3; i+=3)
	1069	muladd3_by_32(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b, n);
	1070	for (; i < last; i++)
	1071	muladd1_by_32(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b, n);
	1072	#else
	1073	// process only two rows at a time: not enough registers :-(
	1074	for (; i <= last-2; i+=2)
	1075	muladd2_by_32(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b, n);
	1076	for (; i < last; i++)
	1077	muladd1_by_32(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b, n);
	1078	#endif
	1079	}
	1080
	1081
	1082	// this assumes n is a multiple of 16
	1083	static inline
	1084	void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
	1085	{
	1086	__m256d xvec0, xvec1, xvec2, xvec3;
	1087	__m256d yvec0, yvec1, yvec2, yvec3;
	1088
	1089	__m256d cvec = _mm256_broadcast_sd(&c);
	1090
	1091	for (long i = 0; i < n; i += 16, x += 16, y += 16) {
	1092	xvec0 = _mm256_load_pd(x+0*4);
	1093	xvec1 = _mm256_load_pd(x+1*4);
	1094	xvec2 = _mm256_load_pd(x+2*4);
	1095	xvec3 = _mm256_load_pd(x+3*4);
	1096
	1097	yvec0 = _mm256_load_pd(y+0*4);
	1098	yvec1 = _mm256_load_pd(y+1*4);
	1099	yvec2 = _mm256_load_pd(y+2*4);
	1100	yvec3 = _mm256_load_pd(y+3*4);
	1101
	1102	MUL_ADD(xvec0, yvec0, cvec);
	1103	MUL_ADD(xvec1, yvec1, cvec);
	1104	MUL_ADD(xvec2, yvec2, cvec);
	1105	MUL_ADD(xvec3, yvec3, cvec);
	1106
	1107	_mm256_store_pd(x + 0*4, xvec0);
	1108	_mm256_store_pd(x + 1*4, xvec1);
	1109	_mm256_store_pd(x + 2*4, xvec2);
	1110	_mm256_store_pd(x + 3*4, xvec3);
	1111	}
	1112	}
	1113
	1114	// this one is more general: does not assume that n is a
	1115	// multiple of 16
	1116	static inline
	1117	void muladd_interval1(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
	1118	{
	1119
	1120	__m256d xvec0, xvec1, xvec2, xvec3;
	1121	__m256d yvec0, yvec1, yvec2, yvec3;
	1122	__m256d cvec;
	1123
	1124	if (n >= 4)
	1125	cvec = _mm256_broadcast_sd(&c);
	1126
	1127	long i=0;
	1128	for (; i <= n-16; i += 16, x += 16, y += 16) {
	1129	xvec0 = _mm256_load_pd(x+0*4);
	1130	xvec1 = _mm256_load_pd(x+1*4);
	1131	xvec2 = _mm256_load_pd(x+2*4);
	1132	xvec3 = _mm256_load_pd(x+3*4);
	1133
	1134	yvec0 = _mm256_load_pd(y+0*4);
	1135	yvec1 = _mm256_load_pd(y+1*4);
	1136	yvec2 = _mm256_load_pd(y+2*4);
	1137	yvec3 = _mm256_load_pd(y+3*4);
	1138
	1139	MUL_ADD(xvec0, yvec0, cvec);
	1140	MUL_ADD(xvec1, yvec1, cvec);
	1141	MUL_ADD(xvec2, yvec2, cvec);
	1142	MUL_ADD(xvec3, yvec3, cvec);
	1143
	1144	_mm256_store_pd(x + 0*4, xvec0);
	1145	_mm256_store_pd(x + 1*4, xvec1);
	1146	_mm256_store_pd(x + 2*4, xvec2);
	1147	_mm256_store_pd(x + 3*4, xvec3);
	1148	}
	1149
	1150	for (; i <= n-4; i += 4, x += 4, y += 4) {
	1151	xvec0 = _mm256_load_pd(x+0*4);
	1152	yvec0 = _mm256_load_pd(y+0*4);
	1153	MUL_ADD(xvec0, yvec0, cvec);
	1154	_mm256_store_pd(x + 0*4, xvec0);
	1155	}
	1156
	1157	for (; i < n; i++, x++, y++) {
	1158	x += (y)*c;
	1159	}
	1160	}
	1161
	1162	#define AVX_PD_SZ (4)
	1163
	1164	// experimental: assumes n is a multiple of 4 in the range [0..32]
	1165	#if 1
	1166	static inline
	1167	void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
	1168	{
	1169	n /= 4;
	1170	if (n <= 0 \|\| n > 8) return;
	1171
	1172	x += n*4;
	1173	y += n*4;
	1174
	1175	// n in [1..8]
	1176
	1177	__m256d xvec, yvec, cvec;
	1178
	1179	cvec = _mm256_broadcast_sd(&c);
	1180
	1181	switch (n) {
	1182	case 8: xvec = _mm256_load_pd(x-84); yvec = _mm256_load_pd(y-84); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec);
	1183	case 7: xvec = _mm256_load_pd(x-74); yvec = _mm256_load_pd(y-74); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec);
	1184	case 6: xvec = _mm256_load_pd(x-64); yvec = _mm256_load_pd(y-64); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec);
	1185	case 5: xvec = _mm256_load_pd(x-54); yvec = _mm256_load_pd(y-54); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec);
	1186	case 4: xvec = _mm256_load_pd(x-44); yvec = _mm256_load_pd(y-44); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec);
	1187	case 3: xvec = _mm256_load_pd(x-34); yvec = _mm256_load_pd(y-34); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec);
	1188	case 2: xvec = _mm256_load_pd(x-24); yvec = _mm256_load_pd(y-24); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec);
	1189	case 1: xvec = _mm256_load_pd(x-14); yvec = _mm256_load_pd(y-14); MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec);
	1190	}
	1191
	1192	}
	1193	#else
	1194	static inline
	1195	void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
	1196	{
	1197	for (long i = 0; i < n; i++)
	1198	x[i] += y[i]*c;
	1199	}
	1200	#endif
	1201
	1202	#endif
	1203
	1204
		// DO_MUL(a, b): converts both operands to (signed) long, multiplies them,
		// and converts the product to unsigned long.  The commented-out alternative
		// below multiplies the operands in their own type instead.
		// NOTE(review): presumably the long() conversions exist so that unsigned int
		// operands are widened before the multiply -- confirm.  The product is
		// formed in signed arithmetic, so callers must keep it within range of long.
	1205	#define DO_MUL(a, b) ((unsigned long) (long(a)*long(b)))
	1206	//#define DO_MUL(a, b) ((a)*(b))
	1207
	1208	static
	1209	inline void muladd_interval(unsigned long * NTL_RESTRICT x, unsigned long * NTL_RESTRICT y,
	1210	unsigned long c, long n)
	1211	{
	1212	for (long i = 0; i < n; i++)
	1213	x[i] += DO_MUL(y[i], c);
	1214	}
	1215
	1216	static
	1217	void muladd1_by_32(unsigned long x, const unsigned long a, const unsigned long *b,
	1218	long n)
	1219	{
	1220	for (long j = 0; j < MAT_BLK_SZ; j++) {
	1221	unsigned long sum = x[j];
	1222	long i = 0;
	1223
	1224	for (; i <= n-4; i += 4) {
	1225	sum += DO_MUL(a[i+0], b[i+0]);
	1226	sum += DO_MUL(a[i+1], b[i+1]);
	1227	sum += DO_MUL(a[i+2], b[i+2]);
	1228	sum += DO_MUL(a[i+3], b[i+3]);
	1229	}
	1230
	1231	for (; i < n; i++)
	1232	sum += DO_MUL(a[i], b[i]);
	1233
	1234	x[j] = sum;
	1235	b += MAT_BLK_SZ;
	1236	}
	1237	}
	1238
	1239	// experiment with shorter int's
	1240	static
	1241	void muladd1_by_32(unsigned long x, const unsigned int a, const unsigned int *b,
	1242	long n)
	1243	{
	1244	for (long j = 0; j < MAT_BLK_SZ; j++) {
	1245	unsigned long sum = x[j];
	1246	long i = 0;
	1247
	1248	for (; i <= n-4; i += 4) {
	1249	sum += DO_MUL(a[i+0], b[i+0]);
	1250	sum += DO_MUL(a[i+1], b[i+1]);
	1251	sum += DO_MUL(a[i+2], b[i+2]);
	1252	sum += DO_MUL(a[i+3], b[i+3]);
	1253	}
	1254
	1255	for (; i < n; i++)
	1256	sum += DO_MUL(a[i], b[i]);
	1257
	1258	x[j] = sum;
	1259	b += MAT_BLK_SZ;
	1260	}
	1261	}
	1262
	1263	#if 0
	1264	static
	1265	void muladd1_by_32_full(unsigned long x, const unsigned long a, const unsigned long *b)
	1266	{
	1267	for (long j = 0; j < MAT_BLK_SZ; j++) {
	1268	unsigned long sum = x[j];
	1269	long i = 0;
	1270
	1271	sum += DO_MUL(a[i+0], b[i+0]);
	1272	sum += DO_MUL(a[i+1], b[i+1]);
	1273	sum += DO_MUL(a[i+2], b[i+2]);
	1274	sum += DO_MUL(a[i+3], b[i+3]);
	1275	sum += DO_MUL(a[i+4], b[i+4]);
	1276	sum += DO_MUL(a[i+5], b[i+5]);
	1277	sum += DO_MUL(a[i+6], b[i+6]);
	1278	sum += DO_MUL(a[i+7], b[i+7]);
	1279	sum += DO_MUL(a[i+8], b[i+8]);
	1280	sum += DO_MUL(a[i+9], b[i+9]);
	1281	sum += DO_MUL(a[i+10], b[i+10]);
	1282	sum += DO_MUL(a[i+11], b[i+11]);
	1283	sum += DO_MUL(a[i+12], b[i+12]);
	1284	sum += DO_MUL(a[i+13], b[i+13]);
	1285	sum += DO_MUL(a[i+14], b[i+14]);
	1286	sum += DO_MUL(a[i+15], b[i+15]);
	1287	sum += DO_MUL(a[i+16], b[i+16]);
	1288	sum += DO_MUL(a[i+17], b[i+17]);
	1289	sum += DO_MUL(a[i+18], b[i+18]);
	1290	sum += DO_MUL(a[i+19], b[i+19]);
	1291	sum += DO_MUL(a[i+20], b[i+20]);
	1292	sum += DO_MUL(a[i+21], b[i+21]);
	1293	sum += DO_MUL(a[i+22], b[i+22]);
	1294	sum += DO_MUL(a[i+23], b[i+23]);
	1295	sum += DO_MUL(a[i+24], b[i+24]);
	1296	sum += DO_MUL(a[i+25], b[i+25]);
	1297	sum += DO_MUL(a[i+26], b[i+26]);
	1298	sum += DO_MUL(a[i+27], b[i+27]);
	1299	sum += DO_MUL(a[i+28], b[i+28]);
	1300	sum += DO_MUL(a[i+29], b[i+29]);
	1301	sum += DO_MUL(a[i+30], b[i+30]);
	1302	sum += DO_MUL(a[i+31], b[i+31]);
	1303
	1304	x[j] = sum;
	1305	b += MAT_BLK_SZ;
	1306	}
	1307	}
	1308	#else
	1309
	1310	// this version is faster (by about 25%) on a Sandybridge machine
	1311
		// ONE_STEP_L(i): adds the i-th product term to four running sums at once.
		// It expects the expansion site to have in scope: a, the four accumulators
		// sum, sum_1, sum_2, sum_3, and the four row pointers b, b_1, b_2, b_3.
		// The last statement has no trailing semicolon; the caller supplies it.
	1312	#define ONE_STEP_L(i) \
	1313	sum += DO_MUL(a[i],b[i]);\
	1314	sum_1 += DO_MUL(a[i],b_1[i]);\
	1315	sum_2 += DO_MUL(a[i],b_2[i]);\
	1316	sum_3 += DO_MUL(a[i],b_3[i])\
	1317
	1318
	1319	static
	1320	void muladd1_by_32_full(unsigned long x, const unsigned long a, const unsigned long *b)
	1321	{
	1322	for (long j = 0; j < MAT_BLK_SZ; j+=4) {
	1323
	1324	unsigned long sum = x[j];
	1325	unsigned long sum_1 = x[j+1];
	1326	unsigned long sum_2 = x[j+2];
	1327	unsigned long sum_3 = x[j+3];
	1328
	1329	const unsigned long *b_1 = b+MAT_BLK_SZ;
	1330	const unsigned long b_2 = b+2MAT_BLK_SZ;
	1331	const unsigned long b_3 = b+3MAT_BLK_SZ;
	1332
	1333	ONE_STEP_L(0);
	1334	ONE_STEP_L(1);
	1335	ONE_STEP_L(2);
	1336	ONE_STEP_L(3);
	1337	ONE_STEP_L(4);
	1338	ONE_STEP_L(5);
	1339	ONE_STEP_L(6);
	1340	ONE_STEP_L(7);
	1341	ONE_STEP_L(8);
	1342	ONE_STEP_L(9);
	1343	ONE_STEP_L(10);
	1344	ONE_STEP_L(11);
	1345	ONE_STEP_L(12);
	1346	ONE_STEP_L(13);
	1347	ONE_STEP_L(14);
	1348	ONE_STEP_L(15);
	1349	ONE_STEP_L(16);
	1350	ONE_STEP_L(17);
	1351	ONE_STEP_L(18);
	1352	ONE_STEP_L(19);
	1353	ONE_STEP_L(20);
	1354	ONE_STEP_L(21);
	1355	ONE_STEP_L(22);
	1356	ONE_STEP_L(23);
	1357	ONE_STEP_L(24);
	1358	ONE_STEP_L(25);
	1359	ONE_STEP_L(26);
	1360	ONE_STEP_L(27);
	1361	ONE_STEP_L(28);
	1362	ONE_STEP_L(29);
	1363	ONE_STEP_L(30);
	1364	ONE_STEP_L(31);
	1365
	1366	x[j] = sum;
	1367	x[j+1] = sum_1;
	1368	x[j+2] = sum_2;
	1369	x[j+3] = sum_3;
	1370
	1371	b += 4*MAT_BLK_SZ;
	1372	}
	1373	}
	1374
	1375	// experiment with shorter int's
	1376	static
	1377	void muladd1_by_32_full(unsigned long x, const unsigned int a, const unsigned int *b)
	1378	{
	1379	for (long j = 0; j < MAT_BLK_SZ; j+=4) {
	1380
	1381	unsigned long sum = x[j];
	1382	unsigned long sum_1 = x[j+1];
	1383	unsigned long sum_2 = x[j+2];
	1384	unsigned long sum_3 = x[j+3];
	1385
	1386	const unsigned int *b_1 = b+MAT_BLK_SZ;
	1387	const unsigned int b_2 = b+2MAT_BLK_SZ;
	1388	const unsigned int b_3 = b+3MAT_BLK_SZ;
	1389
	1390	ONE_STEP_L(0);
	1391	ONE_STEP_L(1);
	1392	ONE_STEP_L(2);
	1393	ONE_STEP_L(3);
	1394	ONE_STEP_L(4);
	1395	ONE_STEP_L(5);
	1396	ONE_STEP_L(6);
	1397	ONE_STEP_L(7);
	1398	ONE_STEP_L(8);
	1399	ONE_STEP_L(9);
	1400	ONE_STEP_L(10);
	1401	ONE_STEP_L(11);
	1402	ONE_STEP_L(12);
	1403	ONE_STEP_L(13);
	1404	ONE_STEP_L(14);
	1405	ONE_STEP_L(15);
	1406	ONE_STEP_L(16);
	1407	ONE_STEP_L(17);
	1408	ONE_STEP_L(18);
	1409	ONE_STEP_L(19);
	1410	ONE_STEP_L(20);
	1411	ONE_STEP_L(21);
	1412	ONE_STEP_L(22);
	1413	ONE_STEP_L(23);
	1414	ONE_STEP_L(24);
	1415	ONE_STEP_L(25);
	1416	ONE_STEP_L(26);
	1417	ONE_STEP_L(27);
	1418	ONE_STEP_L(28);
	1419	ONE_STEP_L(29);
	1420	ONE_STEP_L(30);
	1421	ONE_STEP_L(31);
	1422
	1423	x[j] = sum;
	1424	x[j+1] = sum_1;
	1425	x[j+2] = sum_2;
	1426	x[j+3] = sum_3;
	1427
	1428	b += 4*MAT_BLK_SZ;
	1429	}
	1430	}
	1431
	1432	#endif
	1433
	1434	static inline
	1435	void muladd_all_by_32(long first, long last, unsigned long x, const unsigned int a, const unsigned int *b, long n)
	1436	{
	1437	if (n == MAT_BLK_SZ) {
	1438	for (long i = first; i < last; i++)
	1439	muladd1_by_32_full(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b);
	1440	}
	1441	else {
	1442	for (long i = first; i < last; i++)
	1443	muladd1_by_32(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b, n);
	1444	}
	1445	}
	1446
	1447	static inline
	1448	void muladd_all_by_32(long first, long last, unsigned long x, const unsigned long a, const unsigned long *b, long n)
	1449	{
	1450	if (n == MAT_BLK_SZ) {
	1451	for (long i = first; i < last; i++)
	1452	muladd1_by_32_full(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b);
	1453	}
	1454	else {
	1455	for (long i = first; i < last; i++)
	1456	muladd1_by_32(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b, n);
	1457	}
	1458	}
	1459
		// uhlong: unsigned int when an int has at least half as many bits as a
		// long, otherwise unsigned long.
		// NOTE(review): the name suggests "unsigned half long", i.e. a type wide
		// enough for a half-word value -- confirm against its uses.
	1460	#if (NTL_BITS_PER_INT >= NTL_BITS_PER_LONG/2)
	1461
	1462	typedef unsigned int uhlong;
	1463
	1464	#else
	1465
	1466	typedef unsigned long uhlong;
	1467
	1468	#endif
	1469
	1470
	1471
	1472
	1473	// NOTE: the following code is hardcoded for MAT_BLK_SZ == 32.
	1474	// Also, we special case NTL_BITS_PER_LONG-NTL_SP_NBITS > 2, which
	1475	// allows us to accumulate all 32 products without additional carries.
	1476
	1477	#if (NTL_BITS_PER_LONG-NTL_SP_NBITS > 2)
	1478
	1479	static
	1480	void muladd1_by_32(long x, const long a, const long *b,
	1481	long n, long p, sp_ll_reduce_struct ll_red_struct)
	1482	{
	1483	for (long j = 0; j < MAT_BLK_SZ; j++) {
	1484
	1485	ll_type sum;
	1486	ll_init(sum, x[j]);
	1487	#if 0
	1488	for (long i = 0; i < n; i++)
	1489	ll_imul_add(sum, a[i], b[i]);
	1490	#else
	1491	long i=0;
	1492	for(; i <= n-8; i+= 8) {
	1493	ll_imul_add(sum, a[i+0], b[i+0]);
	1494	ll_imul_add(sum, a[i+1], b[i+1]);
	1495	ll_imul_add(sum, a[i+2], b[i+2]);
	1496	ll_imul_add(sum, a[i+3], b[i+3]);
	1497
	1498	ll_imul_add(sum, a[i+4], b[i+4]);
	1499	ll_imul_add(sum, a[i+5], b[i+5]);
	1500	ll_imul_add(sum, a[i+6], b[i+6]);
	1501	ll_imul_add(sum, a[i+7], b[i+7]);
	1502	}
	1503
	1504	for (; i < n; i++)
	1505	ll_imul_add(sum, a[i], b[i]);
	1506
	1507	#endif
	1508
	1509	unsigned long sum0 = ll_get_lo(sum);
	1510	unsigned long sum1 = ll_get_hi(sum);
	1511
	1512	long res;
	1513
	1514	if (ll_red_struct.nbits == NTL_SP_NBITS)
	1515	res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
	1516	else
	1517	res = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
	1518
	1519
	1520	x[j] = res;
	1521	b += MAT_BLK_SZ;
	1522	}
	1523	}
	1524
	1525	#if 0
	1526	static
	1527	void muladd1_by_32_full(long x, const long a, const long *b,
	1528	long p, sp_ll_reduce_struct ll_red_struct)
	1529	{
	1530	for (long j = 0; j < MAT_BLK_SZ; j++) {
	1531
	1532	ll_type sum;
	1533	ll_init(sum, x[j]);
	1534
	1535	ll_imul_add(sum, a[0], b[0]);
	1536	ll_imul_add(sum, a[1], b[1]);
	1537	ll_imul_add(sum, a[2], b[2]);
	1538	ll_imul_add(sum, a[3], b[3]);
	1539	ll_imul_add(sum, a[4], b[4]);
	1540	ll_imul_add(sum, a[5], b[5]);
	1541	ll_imul_add(sum, a[6], b[6]);
	1542	ll_imul_add(sum, a[7], b[7]);
	1543	ll_imul_add(sum, a[8], b[8]);
	1544	ll_imul_add(sum, a[9], b[9]);
	1545	ll_imul_add(sum, a[10], b[10]);
	1546	ll_imul_add(sum, a[11], b[11]);
	1547	ll_imul_add(sum, a[12], b[12]);
	1548	ll_imul_add(sum, a[13], b[13]);
	1549	ll_imul_add(sum, a[14], b[14]);
	1550	ll_imul_add(sum, a[15], b[15]);
	1551	ll_imul_add(sum, a[16], b[16]);
	1552	ll_imul_add(sum, a[17], b[17]);
	1553	ll_imul_add(sum, a[18], b[18]);
	1554	ll_imul_add(sum, a[19], b[19]);
	1555	ll_imul_add(sum, a[20], b[20]);
	1556	ll_imul_add(sum, a[21], b[21]);
	1557	ll_imul_add(sum, a[22], b[22]);
	1558	ll_imul_add(sum, a[23], b[23]);
	1559	ll_imul_add(sum, a[24], b[24]);
	1560	ll_imul_add(sum, a[25], b[25]);
	1561	ll_imul_add(sum, a[26], b[26]);
	1562	ll_imul_add(sum, a[27], b[27]);
	1563	ll_imul_add(sum, a[28], b[28]);
	1564	ll_imul_add(sum, a[29], b[29]);
	1565	ll_imul_add(sum, a[30], b[30]);
	1566	ll_imul_add(sum, a[31], b[31]);
	1567
	1568	unsigned long sum0 = ll_get_lo(sum);
	1569	unsigned long sum1 = ll_get_hi(sum);
	1570
	1571	long res;
	1572
	1573	if (ll_red_struct.nbits == NTL_SP_NBITS)
	1574	res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
	1575	else
	1576	res = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
	1577
	1578
	1579	x[j] = res;
	1580	b += MAT_BLK_SZ;
	1581	}
	1582	}
	1583
	1584	#elif 1
	1585	// This version is consistently fastest on tests on Sandybridge and Haswell
	1586
	1587
	1588
		// ONE_STEP(i): feeds the i-th product term into four double-word
		// accumulators at once via ll_imul_add.  It expects the expansion site to
		// have in scope: a, the accumulators sum, sum_1, sum_2, sum_3, and the row
		// pointers b, b_1, b_2, b_3.  Every statement already ends in a semicolon.
	1589	#define ONE_STEP(i) \
	1590	ll_imul_add(sum, a[i], b[i]);\
	1591	ll_imul_add(sum_1, a[i], b_1[i]);\
	1592	ll_imul_add(sum_2, a[i], b_2[i]);\
	1593	ll_imul_add(sum_3, a[i], b_3[i]);\
	1594
	1595
	1596	void muladd1_by_32_full(long x, const long a, const long *b,
	1597	long p, sp_ll_reduce_struct ll_red_struct)
	1598	{
	1599	for (long j = 0; j < MAT_BLK_SZ; j+=4) {
	1600
	1601	ll_type sum, sum_1, sum_2, sum_3;
	1602	ll_init(sum, x[j]);
	1603	ll_init(sum_1, x[j+1]);
	1604	ll_init(sum_2, x[j+2]);
	1605	ll_init(sum_3, x[j+3]);
	1606
	1607	const long *b_1 = b+MAT_BLK_SZ;
	1608	const long b_2 = b+2MAT_BLK_SZ;
	1609	const long b_3 = b+3MAT_BLK_SZ;
	1610
	1611	ONE_STEP(0);
	1612	ONE_STEP(1);
	1613	ONE_STEP(2);
	1614	ONE_STEP(3);
	1615	ONE_STEP(4);
	1616	ONE_STEP(5);
	1617	ONE_STEP(6);
	1618	ONE_STEP(7);
	1619	ONE_STEP(8);
	1620	ONE_STEP(9);
	1621	ONE_STEP(10);
	1622	ONE_STEP(11);
	1623	ONE_STEP(12);
	1624	ONE_STEP(13);
	1625	ONE_STEP(14);
	1626	ONE_STEP(15);
	1627	ONE_STEP(16);
	1628	ONE_STEP(17);
	1629	ONE_STEP(18);
	1630	ONE_STEP(19);
	1631	ONE_STEP(20);
	1632	ONE_STEP(21);
	1633	ONE_STEP(22);
	1634	ONE_STEP(23);
	1635	ONE_STEP(24);
	1636	ONE_STEP(25);
	1637	ONE_STEP(26);
	1638	ONE_STEP(27);
	1639	ONE_STEP(28);
	1640	ONE_STEP(29);
	1641	ONE_STEP(30);
	1642	ONE_STEP(31);
	1643
	1644	unsigned long sum0 = ll_get_lo(sum);
	1645	unsigned long sum1 = ll_get_hi(sum);
	1646
	1647	unsigned long sum0_1 = ll_get_lo(sum_1);
	1648	unsigned long sum1_1 = ll_get_hi(sum_1);
	1649
	1650	unsigned long sum0_2 = ll_get_lo(sum_2);
	1651	unsigned long sum1_2 = ll_get_hi(sum_2);
	1652
	1653	unsigned long sum0_3 = ll_get_lo(sum_3);
	1654	unsigned long sum1_3 = ll_get_hi(sum_3);
	1655
	1656	if (ll_red_struct.nbits == NTL_SP_NBITS) {
	1657	x[j] = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
	1658	x[j+1] = sp_ll_red_31_normalized(0, sum1_1, sum0_1, p, ll_red_struct);
	1659	x[j+2] = sp_ll_red_31_normalized(0, sum1_2, sum0_2, p, ll_red_struct);
	1660	x[j+3] = sp_ll_red_31_normalized(0, sum1_3, sum0_3, p, ll_red_struct);
	1661	}
	1662	else {
	1663	x[j] = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
	1664	x[j+1] = sp_ll_red_31(0, sum1_1, sum0_1, p, ll_red_struct);
	1665	x[j+2] = sp_ll_red_31(0, sum1_2, sum0_2, p, ll_red_struct);
	1666	x[j+3] = sp_ll_red_31(0, sum1_3, sum0_3, p, ll_red_struct);
	1667	}
	1668
	1669
	1670	b += 4*MAT_BLK_SZ;
	1671	}
	1672	}
	1673
	1674
	1675	#endif
	1676
	1677	#else
	1678
	1679
	1680	static
	1681	void muladd1_by_32(long x, const long a, const long *b,
	1682	long n, long p, sp_ll_reduce_struct ll_red_struct)
	1683	{
	1684	for (long j = 0; j < MAT_BLK_SZ; j++) {
	1685
	1686	ll_type sum;
	1687	ll_init(sum, x[j]);
	1688
	1689	long i = 0;
	1690	for (; i < n-16; i++)
	1691	ll_imul_add(sum, a[i], b[i]);
	1692
	1693	ll_type acc21;
	1694	ll_init(acc21, ll_get_hi(sum));
	1695	unsigned long acc0 = ll_get_lo(sum);
	1696	ll_init(sum, acc0);
	1697
	1698	for (; i < n; i++)
	1699	ll_imul_add(sum, a[i], b[i]);
	1700
	1701	acc0 = ll_get_lo(sum);
	1702	ll_add(acc21, ll_get_hi(sum));
	1703
	1704	long res;
	1705
	1706	if (ll_red_struct.nbits == NTL_SP_NBITS)
	1707	res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
	1708	else
	1709	res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
	1710
	1711	x[j] = res;
	1712	b += MAT_BLK_SZ;
	1713	}
	1714	}
	1715
	1716	static
	1717	void muladd1_by_32_full(long x, const long a, const long *b,
	1718	long p, sp_ll_reduce_struct ll_red_struct)
	1719	{
	1720	for (long j = 0; j < MAT_BLK_SZ; j++) {
	1721
	1722	ll_type sum;
	1723	ll_init(sum, x[j]);
	1724
	1725	ll_imul_add(sum, a[0], b[0]);
	1726	ll_imul_add(sum, a[1], b[1]);
	1727	ll_imul_add(sum, a[2], b[2]);
	1728	ll_imul_add(sum, a[3], b[3]);
	1729	ll_imul_add(sum, a[4], b[4]);
	1730	ll_imul_add(sum, a[5], b[5]);
	1731	ll_imul_add(sum, a[6], b[6]);
	1732	ll_imul_add(sum, a[7], b[7]);
	1733	ll_imul_add(sum, a[8], b[8]);
	1734	ll_imul_add(sum, a[9], b[9]);
	1735	ll_imul_add(sum, a[10], b[10]);
	1736	ll_imul_add(sum, a[11], b[11]);
	1737	ll_imul_add(sum, a[12], b[12]);
	1738	ll_imul_add(sum, a[13], b[13]);
	1739	ll_imul_add(sum, a[14], b[14]);
	1740	ll_imul_add(sum, a[15], b[15]);
	1741
	1742	ll_type acc21;
	1743	ll_init(acc21, ll_get_hi(sum));
	1744	unsigned long acc0 = ll_get_lo(sum);
	1745	ll_init(sum, acc0);
	1746
	1747	ll_imul_add(sum, a[16], b[16]);
	1748	ll_imul_add(sum, a[17], b[17]);
	1749	ll_imul_add(sum, a[18], b[18]);
	1750	ll_imul_add(sum, a[19], b[19]);
	1751	ll_imul_add(sum, a[20], b[20]);
	1752	ll_imul_add(sum, a[21], b[21]);
	1753	ll_imul_add(sum, a[22], b[22]);
	1754	ll_imul_add(sum, a[23], b[23]);
	1755	ll_imul_add(sum, a[24], b[24]);
	1756	ll_imul_add(sum, a[25], b[25]);
	1757	ll_imul_add(sum, a[26], b[26]);
	1758	ll_imul_add(sum, a[27], b[27]);
	1759	ll_imul_add(sum, a[28], b[28]);
	1760	ll_imul_add(sum, a[29], b[29]);
	1761	ll_imul_add(sum, a[30], b[30]);
	1762	ll_imul_add(sum, a[31], b[31]);
	1763
	1764	acc0 = ll_get_lo(sum);
	1765	ll_add(acc21, ll_get_hi(sum));
	1766
	1767	long res;
	1768
	1769	if (ll_red_struct.nbits == NTL_SP_NBITS)
	1770	res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
	1771	else
	1772	res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
	1773
	1774	x[j] = res;
	1775	b += MAT_BLK_SZ;
	1776	}
	1777	}
	1778
	1779
	1780
	1781	#endif
	1782
	1783
	1784	static
	1785	void muladd1_by_32_half2(long x, const long a, const long *b,
	1786	long n, long p, sp_ll_reduce_struct ll_red_struct)
	1787	{
	1788	for (long j = 0; j < MAT_BLK_SZ; j++) {
	1789
	1790	unsigned long sum[2];
	1791	sum[0] = x[j];
	1792	sum[1] = 0;
	1793
	1794	long k=0;
	1795	long i=0;
	1796	for(; i <= n-16; i+= 16) {
	1797	unsigned long lsum = a[i+0]*b[i+0];
	1798	lsum += a[i+1]*b[i+1];
	1799	lsum += a[i+2]*b[i+2];
	1800	lsum += a[i+3]*b[i+3];
	1801	lsum += a[i+4]*b[i+4];
	1802	lsum += a[i+5]*b[i+5];
	1803	lsum += a[i+6]*b[i+6];
	1804	lsum += a[i+7]*b[i+7];
	1805	lsum += a[i+8]*b[i+8];
	1806	lsum += a[i+9]*b[i+9];
	1807	lsum += a[i+10]*b[i+10];
	1808	lsum += a[i+11]*b[i+11];
	1809	lsum += a[i+12]*b[i+12];
	1810	lsum += a[i+13]*b[i+13];
	1811	lsum += a[i+14]*b[i+14];
	1812	lsum += a[i+15]*b[i+15];
	1813	sum[k++] += lsum;
	1814	}
	1815
	1816	if (i < n) {
	1817	unsigned long lsum = a[i]*b[i];
	1818	for (i++; i < n; i++)
	1819	lsum += a[i]*b[i];
	1820	sum[k++] += lsum;
	1821	}
	1822
	1823
	1824	long t0 = sp_ll_red_21(0, sum[0], p, ll_red_struct);
	1825	long t1 = sp_ll_red_21(0, sum[1], p, ll_red_struct);
	1826	x[j] = AddMod(t0, t1, p);
	1827
	1828	b += MAT_BLK_SZ;
	1829	}
	1830	}
	1831
	1832
	1833
	1834	// NOTE: oddly, this is slightly faster than the half2 routine, which
	1835	// I would have thought would be faster
	1836	// DIRT: this assumes MAT_BLK_SZ < (1L << NTL_BITS_PER_LONG/2),
	1837	// which will hold unconditionally for MAT_BLK_SZ < 2^16.
	1838	static
	1839	void muladd1_by_32_half1(long x, const long a, const long *b,
	1840	long n, long p, sp_ll_reduce_struct ll_red_struct)
	1841	{
	1842	for (long j = 0; j < MAT_BLK_SZ; j++) {
	1843
	1844	ll_type sum;
	1845	ll_init(sum, x[j]);
	1846
	1847	long i=0;
	1848	for(; i <= n-4; i+= 4) {
	1849	unsigned long lsum = a[i+0]*b[i+0];
	1850	lsum += a[i+1]*b[i+1];
	1851	lsum += a[i+2]*b[i+2];
	1852	lsum += a[i+3]*b[i+3];
	1853	ll_add(sum, lsum);
	1854	}
	1855
	1856	if (i < n) {
	1857	unsigned long lsum = a[i]*b[i];
	1858	for (i++; i < n; i++)
	1859	lsum += a[i]*b[i];
	1860	ll_add(sum, lsum);
	1861	}
	1862
	1863	unsigned long sum0 = ll_get_lo(sum);
	1864	unsigned long sum1 = ll_get_hi(sum);
	1865	x[j] = sp_ll_red_21(sum1, sum0, p, ll_red_struct);
	1866
	1867	b += MAT_BLK_SZ;
	1868	}
	1869	}
	1870
	1871
	1872	static inline
	1873	void muladd_all_by_32(long first, long last, long x, const long a, const long *b, long n,
	1874	long p, sp_ll_reduce_struct ll_red_struct)
	1875	{
	1876	if ((p-1) >= (1L << ((NTL_BITS_PER_LONG/2)-1))) {
	1877	if (n == MAT_BLK_SZ) {
	1878	for (long i = first; i < last; i++)
	1879	muladd1_by_32_full(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b, p, ll_red_struct);
	1880	}
	1881	else {
	1882	for (long i = first; i < last; i++)
	1883	muladd1_by_32(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b, n, p, ll_red_struct);
	1884	}
	1885	}
	1886	else {
	1887	for (long i = first; i < last; i++)
	1888	muladd1_by_32_half1(x + iMAT_BLK_SZ, a + iMAT_BLK_SZ, b, n, p, ll_red_struct);
	1889	}
	1890	}
	1891
	1892
	1893
	1894	#endif
	1895
	1896
	1897
	1898	static
	1899	inline void muladd_interval(long * NTL_RESTRICT x, long * NTL_RESTRICT y,
	1900	long c, long n, long p, mulmod_t pinv)
	1901	{
	1902	mulmod_precon_t cpinv = PrepMulModPrecon(c, p, pinv);
	1903	for (long i = 0; i < n; i++) {
	1904	long t = MulModPrecon(y[i], c, p, cpinv);
	1905	x[i] = AddMod(x[i], t, p);
	1906	}
	1907	}
	1908
	1909
	1910	// ******************************************************************
	1911	//
	1912	// General matrix multiplication code
	1913	//
	1914	// ******************************************************************
	1915
	1916
	1917
	1918
	1919
	1920	static
	1921	void basic_mul(const mat_window_zz_p& X,
	1922	const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
	1923	{
	1924	long n = A.NumRows();
	1925	long l = A.NumCols();
	1926	long m = B.NumCols();
	1927
	1928	long p = zz_p::modulus();
	1929	mulmod_t pinv = zz_p::ModulusInverse();
	1930
	1931	const bool seq = double(n)double(l)double(m) < PAR_THRESH;
	1932
	1933	NTL_GEXEC_RANGE(seq, n, first, last) {
	1934
	1935	for (long i = first; i < last; i++) {
	1936	long j, k;
	1937	const zz_p* ap = &A[i][0];
	1938
	1939	zz_p *xp = &X[i][0];
	1940	for (j = 0; j < m; j++) xp[j].LoopHole() = 0;
	1941
	1942	for (k = 0; k < l; k++) {
	1943	long aa = rep(ap[k]);
	1944	if (aa != 0) {
	1945	const zz_p* bp = &B[k][0];
	1946	long T1;
	1947	mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);
	1948
	1949	for (j = 0; j < m; j++) {
	1950	T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
	1951	xp[j].LoopHole() = AddMod(rep(xp[j]), T1, p);
	1952	}
	1953	}
	1954	}
	1955	}
	1956
	1957	} NTL_GEXEC_RANGE_END
	1958	}
	1959
	1960
	1961
	1962
	1963	#ifdef NTL_HAVE_LL_TYPE
	1964
	1965	static
	1966	void alt_mul_L(const mat_window_zz_p& X,
	1967	const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
	1968	{
	1969	long n = A.NumRows();
	1970	long l = A.NumCols();
	1971	long m = B.NumCols();
	1972
	1973	long p = zz_p::modulus();
	1974	sp_reduce_struct red_struct = zz_p::red_struct();
	1975
	1976	const bool seq = double(n)double(l)double(m) < PAR_THRESH;
	1977
	1978	NTL_GEXEC_RANGE(seq, m, first, last) {
	1979
	1980	Vec<long> B_col;
	1981	B_col.SetLength(l);
	1982	long *bp = B_col.elts();
	1983
	1984	long i, j, k;
	1985
	1986	for (j = first; j < last; j++) {
	1987	for (k = 0; k < l; k++) bp[k] = rep(B[k][j]);
	1988
	1989	for (i = 0; i < n; i++) {
	1990	const zz_p *ap = &A[i][0];
	1991	X[i][j].LoopHole() = InnerProd_L(bp, ap, l, p, red_struct);
	1992	}
	1993	}
	1994
	1995	} NTL_GEXEC_RANGE_END
	1996	}
	1997
	1998
	1999	static
	2000	void alt_mul_LL(const mat_window_zz_p& X,
	2001	const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
	2002	{
	2003	long n = A.NumRows();
	2004	long l = A.NumCols();
	2005	long m = B.NumCols();
	2006
	2007	long p = zz_p::modulus();
	2008	sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
	2009
	2010	const bool seq = double(n)double(l)double(m) < PAR_THRESH;
	2011
	2012	NTL_GEXEC_RANGE(seq, m, first, last) {
	2013
	2014	Vec<long> B_col;
	2015	B_col.SetLength(l);
	2016	long *bp = B_col.elts();
	2017
	2018	long i, j, k;
	2019
	2020	for (j = first; j < last; j++) {
	2021	for (k = 0; k < l; k++) bp[k] = rep(B[k][j]);
	2022
	2023	for (i = 0; i < n; i++) {
	2024	const zz_p *ap = &A[i][0];
	2025	X[i][j].LoopHole() = InnerProd_LL(bp, ap, l, p, ll_red_struct);
	2026	}
	2027	}
	2028
	2029	} NTL_GEXEC_RANGE_END
	2030	}
	2031
	2032
	2033	#ifdef NTL_HAVE_AVX
	2034
	2035	static
	2036	void blk_mul_DD(const mat_window_zz_p& X,
	2037	const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
	2038	{
	2039	long n = A.NumRows();
	2040	long l = A.NumCols();
	2041	long m = B.NumCols();
	2042
	2043	long p = zz_p::modulus();
	2044	sp_reduce_struct red_struct = zz_p::red_struct();
	2045
	2046	UniqueArray< AlignedArray<double> > A_buf;
	2047	long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	2048	A_buf.SetLength(npanels);
	2049
	2050	for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
	2051	long k_max = min(kk+MAT_BLK_SZ, l);
	2052
	2053	A_buf[panel].SetLength(n * MAT_BLK_SZ);
	2054	double *abp = &A_buf[panel][0];
	2055
	2056	for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
	2057	const zz_p *ap1 = &A[i][0];
	2058	for (long k = kk; k < k_max; k++) {
	2059	abp[k-kk] = rep(ap1[k]);
	2060	}
	2061	for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
	2062	abp[k-kk] = 0;
	2063	}
	2064	}
	2065	}
	2066
	2067	long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	2068
	2069	const bool seq = double(n)double(l)double(m) < PAR_THRESH;
	2070
	2071	NTL_GEXEC_RANGE(seq, nxpanels, first, last)
	2072	NTL_IMPORT(n)
	2073	NTL_IMPORT(l)
	2074	NTL_IMPORT(m)
	2075	NTL_IMPORT(p)
	2076	NTL_IMPORT(red_struct)
	2077
	2078	AlignedArray<double> B_rec;
	2079	B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	2080	double *brec = B_rec.get();
	2081
	2082	AlignedArray<double> X_buf;
	2083	X_buf.SetLength(n*MAT_BLK_SZ);
	2084	double *xbp = X_buf.get();
	2085
	2086	long jj, kk;
	2087	long i, j, k;
	2088	long panel;
	2089	long xpanel;
	2090
	2091	for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
	2092	xpanel++, jj += MAT_BLK_SZ) {
	2093
	2094	long j_max = min(jj+MAT_BLK_SZ, m);
	2095
	2096	for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;
	2097
	2098	long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
	2099	long red_count = red_trigger;
	2100
	2101	for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
	2102	long k_max = min(kk+MAT_BLK_SZ, l);
	2103
	2104	for (k = kk; k < k_max; k++) {
	2105	const zz_p *bp = &B[k][0];
	2106	for (j = jj; j < j_max; j++)
	2107	brec[(k-kk)*MAT_BLK_SZ+(j-jj)] = rep(bp[j]);
	2108	for (j = j_max; j < jj+MAT_BLK_SZ; j++)
	2109	brec[(k-kk)*MAT_BLK_SZ+(j-jj)] = 0;
	2110	}
	2111
	2112
	2113	if (red_count-MAT_BLK_SZ < 0) {
	2114	red_count = red_trigger;
	2115	for (i = 0; i < n*MAT_BLK_SZ; i++)
	2116	xbp[i] = rem((unsigned long)(long)xbp[i], p, red_struct);
	2117	}
	2118
	2119	red_count = red_count-MAT_BLK_SZ;
	2120
	2121	const double *abp = &A_buf[panel][0];
	2122
	2123	muladd_all_by_32(0, n, xbp, abp, brec, k_max-kk);
	2124	}
	2125
	2126
	2127	for (i = 0; i < n; i++) {
	2128	zz_p *xp = &X[i][0];
	2129	for (j = jj; j < j_max; j++)
	2130	xp[j].LoopHole() =
	2131	rem((unsigned long)(long)xbp[i*MAT_BLK_SZ + (j-jj)], p, red_struct);
	2132	}
	2133	}
	2134
	2135	NTL_GEXEC_RANGE_END
	2136	}
	2137
	2138	#endif
	2139
	2140
	2141	static
	2142	void blk_mul_LL(const mat_window_zz_p& X,
	2143	const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
	2144	{
	2145	long n = A.NumRows();
	2146	long l = A.NumCols();
	2147	long m = B.NumCols();
	2148
	2149	long p = zz_p::modulus();
	2150	sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
	2151
	2152	Vec< Vec<long> > A_buf;
	2153	Vec<long *> abufp;
	2154	long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	2155	A_buf.SetLength(npanels);
	2156	abufp.SetLength(npanels);
	2157
	2158	for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
	2159	long k_max = min(kk+MAT_BLK_SZ, l);
	2160
	2161	A_buf[panel].SetLength(n * MAT_BLK_SZ);
	2162	long *abp = A_buf[panel].elts();
	2163	abufp[panel] = abp;
	2164
	2165	for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
	2166	const zz_p *ap1 = &A[i][0];
	2167	for (long k = kk; k < k_max; k++) {
	2168	abp[k-kk] = rep(ap1[k]);
	2169	}
	2170	for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
	2171	abp[k-kk] = 0;
	2172	}
	2173	}
	2174	}
	2175
	2176	long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	2177
	2178	const bool seq = double(n)double(l)double(m) < PAR_THRESH;
	2179
	2180	NTL_GEXEC_RANGE(seq, nxpanels, first, last)
	2181	NTL_IMPORT(n)
	2182	NTL_IMPORT(l)
	2183	NTL_IMPORT(m)
	2184	NTL_IMPORT(p)
	2185	NTL_IMPORT(ll_red_struct)
	2186
	2187	UniqueArray<long> B_rec;
	2188	B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	2189	long *brec = B_rec.get();
	2190
	2191	UniqueArray<long> X_buf;
	2192	X_buf.SetLength(n*MAT_BLK_SZ);
	2193	long *xbp = X_buf.get();
	2194
	2195	long jj, kk;
	2196	long i, j, k;
	2197	long panel;
	2198	long xpanel;
	2199
	2200	for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
	2201	xpanel++, jj += MAT_BLK_SZ) {
	2202
	2203	long j_max = min(jj+MAT_BLK_SZ, m);
	2204
	2205	for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;
	2206
	2207	for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
	2208	long k_max = min(kk+MAT_BLK_SZ, l);
	2209
	2210	// fill brec, transposed
	2211
	2212	for (k = kk; k < k_max; k++) {
	2213	const zz_p *bp = &B[k][0];
	2214	for (j = jj; j < j_max; j++)
	2215	brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = rep(bp[j]);
	2216	for (j = j_max; j < jj+MAT_BLK_SZ; j++)
	2217	brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = 0;
	2218	}
	2219
	2220	const long *abp = abufp[panel];
	2221	muladd_all_by_32(0, n, xbp, abp, brec, k_max-kk, p, ll_red_struct);
	2222	}
	2223
	2224
	2225	for (i = 0; i < n; i++) {
	2226	zz_p *xp = &X[i][0];
	2227	for (j = jj; j < j_max; j++)
	2228	xp[j].LoopHole() = xbp[i*MAT_BLK_SZ + (j-jj)];
	2229	}
	2230	}
	2231
	2232	NTL_GEXEC_RANGE_END
	2233	}
	2234
	2235
	2236	static
	2237	void blk_mul_L(const mat_window_zz_p& X,
	2238	const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
	2239	{
	2240	long n = A.NumRows();
	2241	long l = A.NumCols();
	2242	long m = B.NumCols();
	2243
	2244	long p = zz_p::modulus();
	2245	sp_reduce_struct red_struct = zz_p::red_struct();
	2246
	2247	Vec< Vec<uhlong> > A_buf;
	2248	Vec<uhlong*> abufp;
	2249	long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	2250	A_buf.SetLength(npanels);
	2251	abufp.SetLength(npanels);
	2252
	2253	for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
	2254	long k_max = min(kk+MAT_BLK_SZ, l);
	2255
	2256	A_buf[panel].SetLength(n * MAT_BLK_SZ);
	2257	uhlong *abp = A_buf[panel].elts();
	2258	abufp[panel] = abp;
	2259
	2260	for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
	2261	const zz_p *ap1 = &A[i][0];
	2262	for (long k = kk; k < k_max; k++) {
	2263	abp[k-kk] = rep(ap1[k]);
	2264	}
	2265	for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
	2266	abp[k-kk] = 0;
	2267	}
	2268	}
	2269	}
	2270
	2271	long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	2272
	2273	const bool seq = double(n)double(l)double(m) < PAR_THRESH;
	2274
	2275	NTL_GEXEC_RANGE(seq, nxpanels, first, last)
	2276	NTL_IMPORT(n)
	2277	NTL_IMPORT(l)
	2278	NTL_IMPORT(m)
	2279	NTL_IMPORT(p)
	2280	NTL_IMPORT(red_struct)
	2281
	2282	UniqueArray<uhlong> B_rec;
	2283	B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	2284	uhlong *brec = B_rec.get();
	2285
	2286	UniqueArray<unsigned long> X_buf;
	2287	X_buf.SetLength(n*MAT_BLK_SZ);
	2288	unsigned long *xbp = X_buf.get();
	2289
	2290	long jj, kk;
	2291	long i, j, k;
	2292	long panel;
	2293	long xpanel;
	2294
	2295	for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
	2296	xpanel++, jj += MAT_BLK_SZ) {
	2297
	2298	long j_max = min(jj+MAT_BLK_SZ, m);
	2299
	2300	for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;
	2301
	2302	unsigned long ured_trigger =
	2303	(~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
	2304	// NOTE: corner case at p == 2: need unsigned long to prevent overflow
	2305
	2306	long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
	2307
	2308	long red_count = red_trigger;
	2309
	2310	for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
	2311	long k_max = min(kk+MAT_BLK_SZ, l);
	2312
	2313	// fill brec, transposed
	2314
	2315	for (k = kk; k < k_max; k++) {
	2316	const zz_p *bp = &B[k][0];
	2317	for (j = jj; j < j_max; j++)
	2318	brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = rep(bp[j]);
	2319	for (j = j_max; j < jj+MAT_BLK_SZ; j++)
	2320	brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = 0;
	2321	}
	2322
	2323	if (red_count-MAT_BLK_SZ < 0) {
	2324	red_count = red_trigger;
	2325	for (i = 0; i < n*MAT_BLK_SZ; i++)
	2326	xbp[i] = rem(xbp[i], p, red_struct);
	2327	}
	2328
	2329	red_count = red_count-MAT_BLK_SZ;
	2330
	2331	const uhlong *abp = abufp[panel];
	2332
	2333	muladd_all_by_32(0, n, xbp, abp, brec, k_max-kk);
	2334	}
	2335
	2336
	2337	for (i = 0; i < n; i++) {
	2338	zz_p *xp = &X[i][0];
	2339	for (j = jj; j < j_max; j++)
	2340	xp[j].LoopHole() =
	2341	rem(xbp[i*MAT_BLK_SZ + (j-jj)], p, red_struct);
	2342	}
	2343	}
	2344
	2345	NTL_GEXEC_RANGE_END
	2346	}
	2347
	2348
	2349	#endif
	2350
	2351
	2352
	2353
	2354	static
	2355	void mul_base (const mat_window_zz_p& X,
	2356	const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
	2357	{
	2358	long n = A.NumRows();
	2359	long l = A.NumCols();
	2360	long m = B.NumCols();
	2361
	2362	if (n == 0 \|\| l == 0 \|\| m == 0) {
	2363	clear(X);
	2364	return;
	2365	}
	2366
	2367
	2368	#ifndef NTL_HAVE_LL_TYPE
	2369
	2370	basic_mul(X, A, B);
	2371
	2372	#else
	2373
	2374	if (l < 32) {
	2375	//cerr << "basic_mul\n";
	2376	basic_mul(X, A, B);
	2377	return;
	2378	}
	2379
	2380	long p = zz_p::modulus();
	2381
	2382	if (n/MAT_BLK_SZ < 4 \|\| l/MAT_BLK_SZ < 4 \|\| m/MAT_BLK_SZ < 4) {
	2383	if (cast_unsigned(l) <= (~(0UL))/cast_unsigned(p-1) &&
	2384	cast_unsigned(l)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) {
	2385	//cerr << "alt_mul_L\n";
	2386	alt_mul_L(X, A, B);
	2387	}
	2388	else {
	2389	//cerr << "alt_mul_LL\n";
	2390	alt_mul_LL(X, A, B);
	2391	}
	2392
	2393	return;
	2394	}
	2395
	2396	{
	2397	if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("number too big");
	2398	if (NTL_OVERFLOW(l, MAT_BLK_SZ, 0)) ResourceError("number too big");
	2399	if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("number too big");
	2400
	2401	long V = MAT_BLK_SZ*4;
	2402
	2403	#ifdef NTL_HAVE_AVX
	2404	if (p-1 <= MAX_DBL_INT &&
	2405	V <= (MAX_DBL_INT-(p-1))/(p-1) &&
	2406	V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
	2407
	2408	// cerr << "block_mul_DD\n";
	2409	blk_mul_DD(X, A, B);
	2410	}
	2411	else
	2412	#endif
	2413	if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
	2414	cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
	2415
	2416	//cerr << "blk_mul_L\n";
	2417	blk_mul_L(X, A, B);
	2418
	2419	}
	2420	else {
	2421
	2422	//cerr << "blk_mul_LL\n";
	2423	blk_mul_LL(X, A, B);
	2424	}
	2425	}
	2426
	2427	#endif
	2428
	2429
	2430	}
	2431
	2432	// The following implementation of Strassen is derived directly
	2433	// from the implementation in FLINT v2.5.2 (see http://www.flintlib.org),
	2434	// although a number of details have changed.
	2435	// I include the original copyright notice from the file nmod_mat/mul_strassen.c
	2436	// in the FLINT distribution.
	2437
	2438	/*=============================================================================
	2439
	2440	This file is part of FLINT.
	2441
	2442	FLINT is free software; you can redistribute it and/or modify
	2443	it under the terms of the GNU General Public License as published by
	2444	the Free Software Foundation; either version 2 of the License, or
	2445	(at your option) any later version.
	2446
	2447	FLINT is distributed in the hope that it will be useful,
	2448	but WITHOUT ANY WARRANTY; without even the implied warranty of
	2449	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	2450	GNU General Public License for more details.
	2451
	2452	You should have received a copy of the GNU General Public License
	2453	along with FLINT; if not, write to the Free Software
	2454	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	2455
	2456	=============================================================================*/
	2457	/******************************************************************************
	2458
	2459	Copyright (C) 2008, Martin Albrecht
	2460	Copyright (C) 2008, 2009 William Hart.
	2461	Copyright (C) 2010, Fredrik Johansson
	2462
	2463	******************************************************************************/
	2464
	2465
	2466
	2467
	2468	void mul_strassen(const mat_window_zz_p& C,
	2469	const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
	2470	{
	2471	long a, b, c;
	2472	long anr, anc, bnr, bnc;
	2473
	2474
	2475	a = A.NumRows();
	2476	b = A.NumCols();
	2477	c = B.NumCols();
	2478
	2479
	2480	bool use_DD = false;
	2481	// this code determines if mul_base triggers blk_mul_DD,
	2482	// in which case a higher crossover is used
	2483
	2484	#if (defined(NTL_HAVE_LL_TYPE) && defined(NTL_HAVE_AVX))
	2485	{
	2486	long V = MAT_BLK_SZ*4;
	2487	long p = zz_p::modulus();
	2488
	2489	if (p-1 <= MAX_DBL_INT &&
	2490	V <= (MAX_DBL_INT-(p-1))/(p-1) &&
	2491	V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1))
	2492	{
	2493	use_DD = true;
	2494	}
	2495	}
	2496	#endif
	2497
	2498	long nt = AvailableThreads();
	2499
	2500	long xover;
	2501	// now we set the crossover -- it is kind of a heauristic
	2502	// mess based on nt and use_DD...I've run some tests to
	2503	// make sure these settings are reasonable, but a more
	2504	// rational approach would be preferable
	2505
	2506	if (nt > 1) {
	2507	if (use_DD \|\| nt > 8192/(2*MAT_BLK_SZ))
	2508	xover = 8192;
	2509	else
	2510	xover = max(800, nt2MAT_BLK_SZ);
	2511	}
	2512	else {
	2513	if (use_DD)
	2514	xover = 800;
	2515	else
	2516	xover = 448;
	2517	}
	2518
	2519	if (a <= xover \|\| b <= xover \|\| c <= xover)
	2520	{
	2521	mul_base(C, A, B);
	2522	return;
	2523	}
	2524
	2525	anr = a / 2;
	2526	anc = b / 2;
	2527	bnr = anc;
	2528	bnc = c / 2;
	2529
	2530	const_mat_window_zz_p A11(A, 0, 0, anr, anc);
	2531	const_mat_window_zz_p A12(A, 0, anc, anr, 2*anc);
	2532	const_mat_window_zz_p A21(A, anr, 0, 2*anr, anc);
	2533	const_mat_window_zz_p A22(A, anr, anc, 2anr, 2anc);
	2534
	2535	const_mat_window_zz_p B11(B, 0, 0, bnr, bnc);
	2536	const_mat_window_zz_p B12(B, 0, bnc, bnr, 2*bnc);
	2537	const_mat_window_zz_p B21(B, bnr, 0, 2*bnr, bnc);
	2538	const_mat_window_zz_p B22(B, bnr, bnc, 2bnr, 2bnc);
	2539
	2540	mat_window_zz_p C11(C, 0, 0, anr, bnc);
	2541	mat_window_zz_p C12(C, 0, bnc, anr, 2*bnc);
	2542	mat_window_zz_p C21(C, anr, 0, 2*anr, bnc);
	2543	mat_window_zz_p C22(C, anr, bnc, 2anr, 2bnc);
	2544
	2545	mat_zz_p X1_store;
	2546	X1_store.SetDims(anr, max(bnc, anc));
	2547
	2548	mat_window_zz_p X1a(X1_store, 0, 0, anr, anc);
	2549	mat_window_zz_p X1b(X1_store, 0, 0, anr, bnc);
	2550
	2551	mat_zz_p X2;
	2552	X2.SetDims(anc, bnc);
	2553
	2554	/*
	2555	See Jean-Guillaume Dumas, Clement Pernet, Wei Zhou; "Memory
	2556	efficient scheduling of Strassen-Winograd's matrix multiplication
	2557	algorithm"; http://arxiv.org/pdf/0707.2347v3 for reference on the
	2558	used operation scheduling.
	2559	*/
	2560
	2561	sub(X1a, A11, A21);
	2562	sub(X2, B22, B12);
	2563	mul_strassen(C21, X1a, X2);
	2564
	2565	add(X1a, A21, A22);
	2566	sub(X2, B12, B11);
	2567	mul_strassen(C22, X1a, X2);
	2568
	2569	sub(X1a, X1a, A11);
	2570	sub(X2, B22, X2);
	2571	mul_strassen(C12, X1a, X2);
	2572
	2573	sub(X1a, A12, X1a);
	2574	mul_strassen(C11, X1a, B22);
	2575
	2576
	2577	mul_strassen(X1b, A11, B11);
	2578
	2579	add(C12, X1b, C12);
	2580	add(C21, C12, C21);
	2581	add(C12, C12, C22);
	2582	add(C22, C21, C22);
	2583	add(C12, C12, C11);
	2584	sub(X2, X2, B21);
	2585	mul_strassen(C11, A22, X2);
	2586
	2587	X2.kill();
	2588
	2589	sub(C21, C21, C11);
	2590	mul_strassen(C11, A12, B21);
	2591
	2592	add(C11, X1b, C11);
	2593
	2594	X1_store.kill();
	2595
	2596	if (c > 2bnc) / A by last col of B -> last col of C */
	2597	{
	2598	const_mat_window_zz_p Bc(B, 0, 2*bnc, b, c);
	2599	mat_window_zz_p Cc(C, 0, 2*bnc, a, c);
	2600
	2601	mul_strassen(Cc, A, Bc);
	2602	}
	2603
	2604	if (a > 2anr) / last row of A by B -> last row of C */
	2605	{
	2606	const_mat_window_zz_p Ar(A, 2*anr, 0, a, b);
	2607	mat_window_zz_p Cr(C, 2*anr, 0, a, c);
	2608	mul_strassen(Cr, Ar, B);
	2609	}
	2610
	2611	if (b > 2anc) / last col of A by last row of B -> C */
	2612	{
	2613	const_mat_window_zz_p Ac(A, 0, 2anc, 2anr, b);
	2614	const_mat_window_zz_p Br(B, 2bnr, 0, b, 2bnc);
	2615	mat_window_zz_p Cb(C, 0, 0, 2anr, 2bnc);
	2616
	2617	// Cb += Ac*Br
	2618	mat_zz_p tmp;
	2619	tmp.SetDims(Cb.NumRows(), Cb.NumCols());
	2620	mul_strassen(tmp, Ac, Br);
	2621	add(Cb, Cb, tmp);
	2622	}
	2623	}
	2624
	2625
	2626
	2627
	2628
	2629
	2630
	2631	static
	2632	void mul_aux(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
	2633	{
	2634	long n = A.NumRows();
	2635	long l = A.NumCols();
	2636	long m = B.NumCols();
	2637
	2638	if (l != B.NumRows())
	2639	LogicError("matrix mul: dimension mismatch");
	2640
	2641	X.SetDims(n, m);
	2642
	2643	if (n == 0 \|\| l == 0 \|\| m == 0) {
	2644	clear(X);
	2645	return;
	2646	}
	2647
	2648	mul_strassen(X, A, B);
	2649	}
	2650
	2651
	2652	void mul(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
	2653	{
	2654	if (&X == &A \|\| &X == &B) {
	2655	mat_zz_p tmp;
	2656	mul_aux(tmp, A, B);
	2657	X = tmp;
	2658	}
	2659	else
	2660	mul_aux(X, A, B);
	2661	}
	2662
	2663
	2664	// ******************************************************************
	2665	//
	2666	// Matrix inversion code
	2667	//
	2668	// ******************************************************************
	2669
	2670	static
	2671	long relaxed_InvModStatus(long& x, long a, long n, bool relax)
	2672	{
	2673	if (relax) {
	2674	return InvModStatus(x, a, n);
	2675	}
	2676	else {
	2677	x = InvMod(a, n);
	2678	return 0;
	2679	}
	2680	}
	2681
	2682	static
	2683	void basic_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
	2684	{
	2685	long n = A.NumRows();
	2686
	2687	if (A.NumCols() != n)
	2688	LogicError("inv: nonsquare matrix");
	2689
	2690	if (n == 0) {
	2691	set(d);
	2692	X.SetDims(0, 0);
	2693	return;
	2694	}
	2695
	2696
	2697	Mat<long> M;
	2698	conv(M, A);
	2699	// scratch space
	2700
	2701	Vec<long> P;
	2702	P.SetLength(n);
	2703	for (long k = 0; k < n; k++) P[k] = k;
	2704	// records swap operations
	2705
	2706	long det;
	2707	det = 1;
	2708
	2709	long p = zz_p::modulus();
	2710	mulmod_t pinv = zz_p::ModulusInverse();
	2711
	2712	bool seq = n < PAR_THRESH_SQ;
	2713
	2714	bool pivoting = false;
	2715
	2716	for (long k = 0; k < n; k++) {
	2717	long pos = -1;
	2718	long pivot_inv;
	2719	for (long i = k; i < n; i++) {
	2720	// NOTE: by using InvModStatus, this code will work
	2721	// for prime-powers as well as primes
	2722	long pivot = M[i][k];
	2723	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	2724	pos = i;
	2725	break;
	2726	}
	2727	}
	2728
	2729	if (pos != -1) {
	2730	if (k != pos) {
	2731	swap(M[pos], M[k]);
	2732	det = NegateMod(det, p);
	2733	P[k] = pos;
	2734	pivoting = true;
	2735	}
	2736
	2737	det = MulMod(det, M[k][k], p);
	2738
	2739	{
	2740	// multiply row k by pivot_inv
	2741	long t1 = pivot_inv;
	2742	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	2743	long * NTL_RESTRICT y = &M[k][0];
	2744	for (long j = 0; j < n; j++)
	2745	y[j] = MulModPrecon(y[j], t1, p, t1pinv);
	2746
	2747	y[k] = pivot_inv;
	2748	}
	2749
	2750
	2751
	2752	NTL_GEXEC_RANGE(seq, n, first, last)
	2753	NTL_IMPORT(p)
	2754	NTL_IMPORT(n)
	2755	NTL_IMPORT(k)
	2756	long * NTL_RESTRICT y = &M[k][0];
	2757	for (long i = first; i < last; i++) {
	2758	if (i == k) continue; // skip row k
	2759
	2760	long * NTL_RESTRICT x = &M[i][0];
	2761	long t1 = x[k];
	2762	t1 = NegateMod(t1, p);
	2763	x[k] = 0;
	2764	if (t1 == 0) continue;
	2765
	2766	// add t1 * row k to row i
	2767	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	2768
	2769	for (long j = 0; j < n; j++) {
	2770	long t2 = MulModPrecon(y[j], t1, p, t1pinv);
	2771	x[j] = AddMod(x[j], t2, p);
	2772	}
	2773	}
	2774	NTL_GEXEC_RANGE_END
	2775	}
	2776	else {
	2777	clear(d);
	2778	return;
	2779	}
	2780	}
	2781
	2782	if (pivoting) {
	2783	// pivot colums, using reverse swap sequence
	2784
	2785	for (long i = 0; i < n; i++) {
	2786	long * NTL_RESTRICT x = &M[i][0];
	2787
	2788	for (long k = n-1; k >= 0; k--) {
	2789	long pos = P[k];
	2790	if (pos != k) _ntl_swap(x[pos], x[k]);
	2791	}
	2792	}
	2793	}
	2794
	2795	X.SetDims(n, n);
	2796	for (long i = 0; i < n; i++)
	2797	for (long j = 0; j < n; j++)
	2798	X[i][j].LoopHole() = M[i][j];
	2799
	2800	d.LoopHole() = det;
	2801	}
	2802
	2803
	2804
	2805	#ifdef NTL_HAVE_LL_TYPE
	2806
	2807
	2808
	2809	static
	2810	void alt_inv_L(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
	2811	{
	2812	long n = A.NumRows();
	2813
	2814	if (A.NumCols() != n)
	2815	LogicError("inv: nonsquare matrix");
	2816
	2817	if (n == 0) {
	2818	set(d);
	2819	X.SetDims(0, 0);
	2820	return;
	2821	}
	2822
	2823
	2824	Mat<unsigned long> M;
	2825	conv(M, A);
	2826	// scractch space
	2827
	2828	Vec<long> P;
	2829	P.SetLength(n);
	2830	for (long k = 0; k < n; k++) P[k] = k;
	2831	// records swap operations
	2832
	2833	long det;
	2834	det = 1;
	2835
	2836	long p = zz_p::modulus();
	2837	mulmod_t pinv = zz_p::ModulusInverse();
	2838	sp_reduce_struct red_struct = zz_p::red_struct();
	2839
	2840
	2841
	2842	bool seq = n < PAR_THRESH_SQ;
	2843
	2844	bool pivoting = false;
	2845
	2846	unsigned long ured_trigger =
	2847	(~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
	2848	// NOTE: corner case at p == 2: need unsigned long to prevent overflow
	2849
	2850	long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
	2851
	2852	long red_count = red_trigger;
	2853
	2854
	2855	for (long k = 0; k < n; k++) {
	2856	bool cleanup = false;
	2857
	2858	if (red_count-1 < 0) {
	2859	red_count = red_trigger;
	2860	cleanup = true;
	2861	}
	2862
	2863	red_count = red_count-1;
	2864
	2865	long pos = -1;
	2866	long pivot;
	2867	long pivot_inv;
	2868
	2869	for (long i = k; i < n; i++) {
	2870	// NOTE: by using InvModStatus, this code will work
	2871	// for prime-powers as well as primes
	2872	pivot = rem(M[i][k], p, red_struct);
	2873	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	2874	pos = i;
	2875	break;
	2876	}
	2877	}
	2878
	2879	if (pos != -1) {
	2880	if (k != pos) {
	2881	swap(M[pos], M[k]);
	2882	det = NegateMod(det, p);
	2883	P[k] = pos;
	2884	pivoting = true;
	2885	}
	2886
	2887	det = MulMod(det, pivot, p);
	2888
	2889	{
	2890	// multiply row k by pivot_inv
	2891	long t1 = pivot_inv;
	2892	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
	2893	unsigned long * NTL_RESTRICT y = &M[k][0];
	2894	for (long j = 0; j < n; j++) {
	2895	long t2 = rem(y[j], p, red_struct);
	2896	y[j] = MulModPrecon(t2, t1, p, t1pinv);
	2897	}
	2898
	2899	y[k] = pivot_inv;
	2900	}
	2901
	2902
	2903	NTL_GEXEC_RANGE(seq, n, first, last)
	2904	NTL_IMPORT(p)
	2905	NTL_IMPORT(n)
	2906	NTL_IMPORT(k)
	2907	NTL_IMPORT(red_struct)
	2908	unsigned long * NTL_RESTRICT y = &M[k][0];
	2909	if (cleanup) {
	2910	for (long i = first; i < last; i++) {
	2911	if (i == k) continue;
	2912	// skip row k: the data won't change, but it
	2913	// technically is a race condition in a multi-theaded
	2914	// execution, and it would violate the "restrict"
	2915	// contract
	2916
	2917	unsigned long * NTL_RESTRICT x = &M[i][0];
	2918	for (long j = 0; j < n; j++) {
	2919	x[j] = rem(x[j], p, red_struct);
	2920	}
	2921	}
	2922	}
	2923
	2924
	2925	for (long i = first; i < last; i++) {
	2926	if (i == k) continue; // skip row k
	2927
	2928	unsigned long * NTL_RESTRICT x = &M[i][0];
	2929	long t1 = rem(x[k], p, red_struct);
	2930	t1 = NegateMod(t1, p);
	2931	x[k] = 0;
	2932	if (t1 == 0) continue;
	2933
	2934	// add t1 * row k to row i
	2935	unsigned long ut1 = t1;
	2936	long j;
	2937	for (j = 0; j <= n-4; j+=4) {
	2938	unsigned long xj0 = x[j+0] + DO_MUL(y[j+0], ut1);
	2939	unsigned long xj1 = x[j+1] + DO_MUL(y[j+1], ut1);
	2940	unsigned long xj2 = x[j+2] + DO_MUL(y[j+2], ut1);
	2941	unsigned long xj3 = x[j+3] + DO_MUL(y[j+3], ut1);
	2942	x[j+0] = xj0;
	2943	x[j+1] = xj1;
	2944	x[j+2] = xj2;
	2945	x[j+3] = xj3;
	2946	}
	2947	for (; j < n; j++) {
	2948	x[j] += DO_MUL(y[j], ut1);
	2949	}
	2950	}
	2951	NTL_GEXEC_RANGE_END
	2952	}
	2953	else {
	2954	clear(d);
	2955	return;
	2956	}
	2957	}
	2958
	2959	if (pivoting) {
	2960	// pivot colums, using reverse swap sequence
	2961
	2962	for (long i = 0; i < n; i++) {
	2963	unsigned long * NTL_RESTRICT x = &M[i][0];
	2964
	2965	for (long k = n-1; k >= 0; k--) {
	2966	long pos = P[k];
	2967	if (pos != k) _ntl_swap(x[pos], x[k]);
	2968	}
	2969	}
	2970	}
	2971
	2972	X.SetDims(n, n);
	2973	for (long i = 0; i < n; i++)
	2974	for (long j = 0; j < n; j++)
	2975	X[i][j].LoopHole() = rem(M[i][j], p, red_struct);
	2976
	2977	d.LoopHole() = det;
	2978	}
	2979
	2980
	2981
	2982
	2983
	2984	#ifdef NTL_HAVE_AVX
	2985
	2986	static
	2987	void alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
	2988	{
	2989	long n = A.NumRows();
	2990
	2991	if (A.NumCols() != n)
	2992	LogicError("inv: nonsquare matrix");
	2993
	2994	if (n == 0) {
	2995	set(d);
	2996	X.SetDims(0, 0);
	2997	return;
	2998	}
	2999
	3000	Vec< AlignedArray<double> > M;
	3001	M.SetLength(n);
	3002	for (long i = 0; i < n; i++) M[i].SetLength(n);
	3003
	3004	for (long i = 0; i < n; i++) {
	3005	for (long j = 0; j < n; j++)
	3006	M[i][j] = rep(A[i][j]);
	3007	}
	3008
	3009
	3010	Vec<long> P;
	3011	P.SetLength(n);
	3012	for (long k = 0; k < n; k++) P[k] = k;
	3013	// records swap operations
	3014
	3015	long det;
	3016	det = 1;
	3017
	3018	long p = zz_p::modulus();
	3019	mulmod_t pinv = zz_p::ModulusInverse();
	3020	sp_reduce_struct red_struct = zz_p::red_struct();
	3021
	3022
	3023
	3024	bool seq = n < PAR_THRESH_SQ;
	3025
	3026	bool pivoting = false;
	3027
	3028	long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
	3029	long red_count = red_trigger;
	3030
	3031	for (long k = 0; k < n; k++) {
	3032	bool cleanup = false;
	3033
	3034	if (red_count-1 < 0) {
	3035	red_count = red_trigger;
	3036	cleanup = true;
	3037	}
	3038
	3039	red_count = red_count-1;
	3040
	3041	long pos = -1;
	3042	long pivot;
	3043	long pivot_inv;
	3044
	3045
	3046
	3047	for (long i = k; i < n; i++) {
	3048	// NOTE: by using InvModStatus, this code will work
	3049	// for prime-powers as well as primes
	3050	pivot = rem((unsigned long)(long)M[i][k], p, red_struct);
	3051	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	3052	pos = i;
	3053	break;
	3054	}
	3055	}
	3056
	3057	if (pos != -1) {
	3058	if (k != pos) {
	3059	swap(M[pos], M[k]);
	3060	det = NegateMod(det, p);
	3061	P[k] = pos;
	3062	pivoting = true;
	3063	}
	3064
	3065	det = MulMod(det, pivot, p);
	3066
	3067	{
	3068	// multiply row k by pivot_inv
	3069	long t1 = pivot_inv;
	3070	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
	3071	double * NTL_RESTRICT y = &M[k][0];
	3072	for (long j = 0; j < n; j++) {
	3073	long t2 = rem((unsigned long)(long)y[j], p, red_struct);
	3074	y[j] = MulModPrecon(t2, t1, p, t1pinv);
	3075	}
	3076
	3077	y[k] = pivot_inv;
	3078	}
	3079
	3080
	3081	NTL_GEXEC_RANGE(seq, n, first, last)
	3082	NTL_IMPORT(p)
	3083	NTL_IMPORT(n)
	3084	NTL_IMPORT(k)
	3085	NTL_IMPORT(red_struct)
	3086	double * NTL_RESTRICT y = &M[k][0];
	3087	if (cleanup) {
	3088	for (long i = first; i < last; i++) {
	3089	if (i == k) continue;
	3090	// skip row k: the data won't change, but it
	3091	// technically is a race condition in a multi-theaded
	3092	// execution, and it would violate the "restrict"
	3093	// contract
	3094
	3095	double * NTL_RESTRICT x = &M[i][0];
	3096	for (long j = 0; j < n; j++) {
	3097	x[j] = rem((unsigned long)(long)x[j], p, red_struct);
	3098	}
	3099	}
	3100	}
	3101
	3102
	3103	for (long i = first; i < last; i++) {
	3104	if (i == k) continue; // skip row k
	3105
	3106	double * NTL_RESTRICT x = &M[i][0];
	3107	long t1 = rem((unsigned long)(long)x[k], p, red_struct);
	3108	t1 = NegateMod(t1, p);
	3109	x[k] = 0;
	3110	if (t1 == 0) continue;
	3111
	3112	// add t1 * row k to row i
	3113	double ut1 = t1;
	3114	muladd_interval1(x, y, ut1, n);
	3115	}
	3116	NTL_GEXEC_RANGE_END
	3117	}
	3118	else {
	3119	clear(d);
	3120	return;
	3121	}
	3122	}
	3123
	3124
	3125	if (pivoting) {
	3126	// pivot colums, using reverse swap sequence
	3127
	3128	for (long i = 0; i < n; i++) {
	3129	double * NTL_RESTRICT x = &M[i][0];
	3130
	3131	for (long k = n-1; k >= 0; k--) {
	3132	long pos = P[k];
	3133	if (pos != k) _ntl_swap(x[pos], x[k]);
	3134	}
	3135	}
	3136	}
	3137
	3138
	3139	X.SetDims(n, n);
	3140	for (long i = 0; i < n; i++)
	3141	for (long j = 0; j < n; j++)
	3142	X[i][j].LoopHole() = rem((unsigned long)(long)M[i][j], p, red_struct);
	3143
	3144	d.LoopHole() = det;
	3145	}
	3146
	3147	#endif
	3148
	3149
	3150
	3151
	3152
	3153	#ifdef NTL_HAVE_AVX
	3154
	3155	static
	3156	void blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
	3157	{
	3158	long n = A.NumRows();
	3159
	3160	if (A.NumCols() != n)
	3161	LogicError("inv: nonsquare matrix");
	3162
	3163	if (n == 0) {
	3164	set(d);
	3165	X.SetDims(0, 0);
	3166	return;
	3167	}
	3168
	3169	if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	3170
	3171	long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	3172
	3173
	3174	Vec< AlignedArray<double> > M;
	3175	M.SetLength(npanels);
	3176	for (long panel = 0; panel < npanels; panel++) {
	3177	M[panel].SetLength(n*MAT_BLK_SZ);
	3178	double *panelp = &M[panel][0];
	3179
	3180	for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
	3181	}
	3182
	3183	// copy A into panels
	3184	for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
	3185	long j_max = min(jj+MAT_BLK_SZ, n);
	3186	double *panelp = &M[panel][0];
	3187
	3188	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	3189	const zz_p *ap = A[i].elts() + jj;
	3190
	3191	for (long j = jj; j < j_max; j++)
	3192	panelp[j-jj] = rep(ap[j-jj]);
	3193	}
	3194	}
	3195
	3196	Vec<long> P;
	3197	P.SetLength(n);
	3198	for (long k = 0; k < n; k++) P[k] = k;
	3199	// records swap operations
	3200
	3201
	3202	long det;
	3203	det = 1;
	3204
	3205	long p = zz_p::modulus();
	3206	mulmod_t pinv = zz_p::ModulusInverse();
	3207	sp_reduce_struct red_struct = zz_p::red_struct();
	3208
	3209
	3210	bool seq = double(n)double(n)double(MAT_BLK_SZ) < PAR_THRESH;
	3211
	3212	bool pivoting = false;
	3213
	3214	long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
	3215	long red_count = red_trigger;
	3216
	3217	for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
	3218	long k_max = min(kk+MAT_BLK_SZ, n);
	3219
	3220	bool cleanup = false;
	3221
	3222	if (red_count-MAT_BLK_SZ < 0) {
	3223	red_count = red_trigger;
	3224	cleanup = true;
	3225	}
	3226
	3227	red_count = red_count-MAT_BLK_SZ;
	3228	double * NTL_RESTRICT kpanelp = &M[kpanel][0];
	3229
	3230	if (cleanup) {
	3231	for (long r = 0; r < n*MAT_BLK_SZ; r++)
	3232	kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
	3233	}
	3234
	3235	for (long k = kk; k < k_max; k++) {
	3236
	3237	long pos = -1;
	3238	long pivot;
	3239	long pivot_inv;
	3240
	3241	for (long i = k; i < n; i++) {
	3242	// NOTE: by using InvModStatus, this code will work
	3243	// for prime-powers as well as primes
	3244	pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
	3245	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	3246	pos = i;
	3247	break;
	3248	}
	3249	}
	3250
	3251	if (pos == -1) {
	3252	clear(d);
	3253	return;
	3254	}
	3255
	3256	double * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
	3257	if (k != pos) {
	3258	// swap rows pos and k
	3259	double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
	3260	for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
	3261
	3262	det = NegateMod(det, p);
	3263	P[k] = pos;
	3264	pivoting = true;
	3265	}
	3266
	3267	det = MulMod(det, pivot, p);
	3268
	3269	{
	3270	// multiply row k by pivot_inv
	3271	long t1 = pivot_inv;
	3272	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	3273	for (long j = 0; j < MAT_BLK_SZ; j++) {
	3274	long t2 = rem((unsigned long)(long)y[j], p, red_struct);
	3275	y[j] = MulModPrecon(t2, t1, p, t1pinv);
	3276	}
	3277
	3278	y[k-kk] = pivot_inv;
	3279	}
	3280
	3281	for (long i = 0; i < n; i++) {
	3282	if (i == k) continue; // skip row k
	3283
	3284	double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
	3285	long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
	3286	t1 = NegateMod(t1, p);
	3287	x[k-kk] = 0;
	3288	if (t1 == 0) continue;
	3289
	3290	// add t1 * row k to row i
	3291	double ut1 = t1;
	3292	muladd_interval(x, y, ut1, MAT_BLK_SZ);
	3293	}
	3294	}
	3295
	3296
	3297	// finished processing current kpanel
	3298	// next, reduce and apply to all other kpanels
	3299
	3300	for (long r = 0; r < n*MAT_BLK_SZ; r++)
	3301	kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
	3302
	3303	// special processing: subtract 1 off of diangonal
	3304
	3305	for (long k = kk; k < k_max; k++)
	3306	kpanelp[kMAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	3307
	3308
	3309	NTL_GEXEC_RANGE(seq, npanels, first, last)
	3310	NTL_IMPORT(p)
	3311	NTL_IMPORT(n)
	3312	NTL_IMPORT(red_struct)
	3313	NTL_IMPORT(kpanel)
	3314	NTL_IMPORT(kpanelp)
	3315	NTL_IMPORT(kk)
	3316	NTL_IMPORT(k_max)
	3317
	3318
	3319	AlignedArray<double> buf_store;
	3320	buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	3321	double *buf = &buf_store[0];
	3322
	3323	for (long jpanel = first; jpanel < last; jpanel++) {
	3324	if (jpanel == kpanel) continue;
	3325
	3326	double * NTL_RESTRICT jpanelp = &M[jpanel][0];
	3327
	3328	if (cleanup) {
	3329	for (long r = 0; r < n*MAT_BLK_SZ; r++)
	3330	jpanelp[r] = rem((unsigned long)(long)jpanelp[r], p, red_struct);
	3331	}
	3332
	3333	// perform swaps
	3334	for (long k = kk; k < k_max; k++) {
	3335	long pos = P[k];
	3336	if (pos != k) {
	3337	// swap rows pos and k
	3338	double * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
	3339	double * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
	3340	for (long j = 0; j < MAT_BLK_SZ; j++)
	3341	_ntl_swap(pos_p[j], k_p[j]);
	3342	}
	3343	}
	3344
	3345	// copy block number kpanel (the one on the diagonal) into buf
	3346
	3347	for (long i = 0; i < (k_max-kk)*MAT_BLK_SZ; i++)
	3348	buf[i] = rem((unsigned long)(long)jpanelp[kk*MAT_BLK_SZ+i], p, red_struct);
	3349
	3350	// jpanel += kpanel*buf
	3351
	3352	muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk);
	3353	}
	3354
	3355	NTL_GEXEC_RANGE_END
	3356
	3357	// special processing: add 1 back to the diangonal
	3358
	3359	for (long k = kk; k < k_max; k++)
	3360	kpanelp[kMAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	3361
	3362	}
	3363
	3364	if (pivoting) {
	3365	// pivot colums, using reverse swap sequence
	3366
	3367	for (long k = n-1; k >= 0; k--) {
	3368	long pos = P[k];
	3369	if (pos != k) {
	3370	// swap columns pos and k
	3371
	3372	double * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
	3373	double * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
	3374	for (long i = 0; i < n; i++) {
	3375	_ntl_swap(x[iMAT_BLK_SZ], y[iMAT_BLK_SZ]);
	3376	}
	3377	}
	3378	}
	3379	}
	3380
	3381
	3382	// copy panels into X
	3383	X.SetDims(n, n);
	3384	for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
	3385	long j_max = min(jj+MAT_BLK_SZ, n);
	3386	double *panelp = &M[panel][0];
	3387
	3388	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	3389	zz_p *xp = X[i].elts() + jj;
	3390
	3391	for (long j = jj; j < j_max; j++)
	3392	xp[j-jj].LoopHole() = rem((unsigned long)(long)panelp[j-jj], p, red_struct);
	3393	}
	3394	}
	3395
	3396	d.LoopHole() = det;
	3397
	3398	}
	3399
	3400	#endif
	3401
	3402
	3403
	3404	static
	3405	void blk_inv_L(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
	3406	{
	3407	long n = A.NumRows();
	3408
	3409	if (A.NumCols() != n)
	3410	LogicError("inv: nonsquare matrix");
	3411
	3412	if (n == 0) {
	3413	set(d);
	3414	X.SetDims(0, 0);
	3415	return;
	3416	}
	3417
	3418	if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	3419
	3420	long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	3421
	3422	Vec< UniqueArray<unsigned long> > M;
	3423	M.SetLength(npanels);
	3424	for (long panel = 0; panel < npanels; panel++) {
	3425	M[panel].SetLength(n*MAT_BLK_SZ);
	3426	unsigned long *panelp = &M[panel][0];
	3427
	3428	for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
	3429	}
	3430
	3431	// copy A into panels
	3432	for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
	3433	long j_max = min(jj+MAT_BLK_SZ, n);
	3434	unsigned long *panelp = &M[panel][0];
	3435
	3436	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	3437	const zz_p *ap = A[i].elts() + jj;
	3438
	3439	for (long j = jj; j < j_max; j++)
	3440	panelp[j-jj] = rep(ap[j-jj]);
	3441	}
	3442	}
	3443
	3444	Vec<long> P;
	3445	P.SetLength(n);
	3446	for (long k = 0; k < n; k++) P[k] = k;
	3447	// records swap operations
	3448
	3449
	3450	long det;
	3451	det = 1;
	3452
	3453	long p = zz_p::modulus();
	3454	mulmod_t pinv = zz_p::ModulusInverse();
	3455	sp_reduce_struct red_struct = zz_p::red_struct();
	3456
	3457
	3458	bool seq = double(n)double(n)double(MAT_BLK_SZ) < PAR_THRESH;
	3459
	3460	bool pivoting = false;
	3461
	3462	unsigned long ured_trigger =
	3463	(~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
	3464	// NOTE: corner case at p == 2: need unsigned long to prevent overflow
	3465
	3466	long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
	3467
	3468	long red_count = red_trigger;
	3469
	3470	for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
	3471	long k_max = min(kk+MAT_BLK_SZ, n);
	3472
	3473	bool cleanup = false;
	3474
	3475	if (red_count-MAT_BLK_SZ < 0) {
	3476	red_count = red_trigger;
	3477	cleanup = true;
	3478	}
	3479
	3480	red_count = red_count-MAT_BLK_SZ;
	3481	unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
	3482
	3483	if (cleanup) {
	3484	for (long r = 0; r < n*MAT_BLK_SZ; r++)
	3485	kpanelp[r] = rem(kpanelp[r], p, red_struct);
	3486	}
	3487
	3488	for (long k = kk; k < k_max; k++) {
	3489
	3490	long pos = -1;
	3491	long pivot;
	3492	long pivot_inv;
	3493
	3494	for (long i = k; i < n; i++) {
	3495	// NOTE: by using InvModStatus, this code will work
	3496	// for prime-powers as well as primes
	3497	pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
	3498	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	3499	pos = i;
	3500	break;
	3501	}
	3502	}
	3503
	3504	if (pos == -1) {
	3505	clear(d);
	3506	return;
	3507	}
	3508
	3509	unsigned long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
	3510	if (k != pos) {
	3511	// swap rows pos and k
	3512	unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
	3513	for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
	3514
	3515	det = NegateMod(det, p);
	3516	P[k] = pos;
	3517	pivoting = true;
	3518	}
	3519
	3520	det = MulMod(det, pivot, p);
	3521
	3522	{
	3523	// multiply row k by pivot_inv
	3524	long t1 = pivot_inv;
	3525	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	3526	for (long j = 0; j < MAT_BLK_SZ; j++) {
	3527	long t2 = rem(y[j], p, red_struct);
	3528	y[j] = MulModPrecon(t2, t1, p, t1pinv);
	3529	}
	3530
	3531	y[k-kk] = pivot_inv;
	3532	}
	3533
	3534	for (long i = 0; i < n; i++) {
	3535	if (i == k) continue; // skip row k
	3536
	3537	unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
	3538	long t1 = rem(x[k-kk], p, red_struct);
	3539	t1 = NegateMod(t1, p);
	3540	x[k-kk] = 0;
	3541	if (t1 == 0) continue;
	3542
	3543	// add t1 * row k to row i
	3544	unsigned long ut1 = t1;
	3545	muladd_interval(x, y, ut1, MAT_BLK_SZ);
	3546	}
	3547	}
	3548
	3549
	3550	// finished processing current kpanel
	3551	// next, reduce and apply to all other kpanels
	3552
	3553	for (long r = 0; r < n*MAT_BLK_SZ; r++)
	3554	kpanelp[r] = rem(kpanelp[r], p, red_struct);
	3555
	3556	// special processing: subtract 1 off of diangonal
	3557
	3558	for (long k = kk; k < k_max; k++)
	3559	kpanelp[kMAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	3560
	3561
	3562	NTL_GEXEC_RANGE(seq, npanels, first, last)
	3563	NTL_IMPORT(p)
	3564	NTL_IMPORT(n)
	3565	NTL_IMPORT(red_struct)
	3566	NTL_IMPORT(kpanel)
	3567	NTL_IMPORT(kpanelp)
	3568	NTL_IMPORT(kk)
	3569	NTL_IMPORT(k_max)
	3570
	3571
	3572	UniqueArray<unsigned long> buf_store;
	3573	buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	3574	unsigned long *buf = &buf_store[0];
	3575
	3576	for (long jpanel = first; jpanel < last; jpanel++) {
	3577	if (jpanel == kpanel) continue;
	3578
	3579	unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
	3580
	3581	if (cleanup) {
	3582	for (long r = 0; r < n*MAT_BLK_SZ; r++)
	3583	jpanelp[r] = rem(jpanelp[r], p, red_struct);
	3584	}
	3585
	3586	// perform swaps
	3587	for (long k = kk; k < k_max; k++) {
	3588	long pos = P[k];
	3589	if (pos != k) {
	3590	// swap rows pos and k
	3591	unsigned long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
	3592	unsigned long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
	3593	for (long j = 0; j < MAT_BLK_SZ; j++)
	3594	_ntl_swap(pos_p[j], k_p[j]);
	3595	}
	3596	}
	3597
	3598	// copy block number kpanel (the one on the diagonal) into buf
	3599	// here, we transpose it
	3600
	3601	for (long k = kk; k < k_max; k++)
	3602	for (long j = 0; j < MAT_BLK_SZ; j++)
	3603	buf[j*MAT_BLK_SZ + (k-kk)] =
	3604	rem(jpanelp[k*MAT_BLK_SZ+j], p, red_struct);
	3605
	3606	// jpanel += kpanel*buf
	3607
	3608	muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk);
	3609	}
	3610
	3611	NTL_GEXEC_RANGE_END
	3612
	3613	// special processing: add 1 back to the diangonal
	3614
	3615	for (long k = kk; k < k_max; k++)
	3616	kpanelp[kMAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	3617
	3618	}
	3619
	3620	if (pivoting) {
	3621	// pivot colums, using reverse swap sequence
	3622
	3623	for (long k = n-1; k >= 0; k--) {
	3624	long pos = P[k];
	3625	if (pos != k) {
	3626	// swap columns pos and k
	3627
	3628	unsigned long * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
	3629	unsigned long * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
	3630	for (long i = 0; i < n; i++) {
	3631	_ntl_swap(x[iMAT_BLK_SZ], y[iMAT_BLK_SZ]);
	3632	}
	3633	}
	3634	}
	3635	}
	3636
	3637	// copy panels into X
	3638	X.SetDims(n, n);
	3639	for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
	3640	long j_max = min(jj+MAT_BLK_SZ, n);
	3641	unsigned long *panelp = &M[panel][0];
	3642
	3643	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	3644	zz_p *xp = X[i].elts() + jj;
	3645
	3646	for (long j = jj; j < j_max; j++)
	3647	xp[j-jj].LoopHole() = rem(panelp[j-jj], p, red_struct);
	3648	}
	3649	}
	3650
	3651	d.LoopHole() = det;
	3652
	3653	}
	3654
	3655
	3656
	3657
	3658
	3659
	3660
	3661
	3662	static
	3663	void blk_inv_LL(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
	3664	{
	3665	long n = A.NumRows();
	3666
	3667	if (A.NumCols() != n)
	3668	LogicError("inv: nonsquare matrix");
	3669
	3670	if (n == 0) {
	3671	set(d);
	3672	X.SetDims(0, 0);
	3673	return;
	3674	}
	3675
	3676	if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too big");
	3677
	3678	long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	3679
	3680	Vec< UniqueArray<long> > M;
	3681	M.SetLength(npanels);
	3682	for (long panel = 0; panel < npanels; panel++) {
	3683	M[panel].SetLength(n*MAT_BLK_SZ);
	3684	long *panelp = &M[panel][0];
	3685
	3686	for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
	3687	}
	3688
	3689
	3690	// copy A into panels
	3691	for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
	3692	long j_max = min(jj+MAT_BLK_SZ, n);
	3693	long *panelp = &M[panel][0];
	3694
	3695	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	3696	const zz_p *ap = A[i].elts() + jj;
	3697
	3698	for (long j = jj; j < j_max; j++)
	3699	panelp[j-jj] = rep(ap[j-jj]);
	3700	}
	3701	}
	3702
	3703	Vec<long> P;
	3704	P.SetLength(n);
	3705	for (long k = 0; k < n; k++) P[k] = k;
	3706	// records swap operations
	3707
	3708
	3709	long det;
	3710	det = 1;
	3711
	3712	long p = zz_p::modulus();
	3713	mulmod_t pinv = zz_p::ModulusInverse();
	3714	sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
	3715
	3716
	3717	bool seq = double(n)double(n)double(MAT_BLK_SZ) < PAR_THRESH;
	3718
	3719	bool pivoting = false;
	3720
	3721	for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
	3722	long k_max = min(kk+MAT_BLK_SZ, n);
	3723
	3724	long * NTL_RESTRICT kpanelp = &M[kpanel][0];
	3725
	3726
	3727	for (long k = kk; k < k_max; k++) {
	3728
	3729	long pos = -1;
	3730	long pivot;
	3731	long pivot_inv;
	3732
	3733	for (long i = k; i < n; i++) {
	3734	// NOTE: by using InvModStatus, this code will work
	3735	// for prime-powers as well as primes
	3736	pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
	3737	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	3738	pos = i;
	3739	break;
	3740	}
	3741	}
	3742
	3743	if (pos == -1) {
	3744	clear(d);
	3745	return;
	3746	}
	3747
	3748	long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
	3749	if (k != pos) {
	3750	// swap rows pos and k
	3751	long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
	3752	for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
	3753
	3754	det = NegateMod(det, p);
	3755	P[k] = pos;
	3756	pivoting = true;
	3757	}
	3758
	3759	det = MulMod(det, pivot, p);
	3760
	3761	{
	3762	// multiply row k by pivot_inv
	3763	long t1 = pivot_inv;
	3764	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	3765	for (long j = 0; j < MAT_BLK_SZ; j++) {
	3766	y[j] = MulModPrecon(y[j], t1, p, t1pinv);
	3767	}
	3768
	3769	y[k-kk] = pivot_inv;
	3770	}
	3771
	3772	for (long i = 0; i < n; i++) {
	3773	if (i == k) continue; // skip row k
	3774
	3775	long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
	3776	long t1 = x[k-kk];
	3777	t1 = NegateMod(t1, p);
	3778	x[k-kk] = 0;
	3779	if (t1 == 0) continue;
	3780
	3781	// add t1 * row k to row i
	3782	long ut1 = t1;
	3783	muladd_interval(x, y, ut1, MAT_BLK_SZ, p, pinv);
	3784	}
	3785	}
	3786
	3787
	3788	// finished processing current kpanel
	3789	// next, reduce and apply to all other kpanels
	3790
	3791	// special processing: subtract 1 off of diangonal
	3792
	3793	for (long k = kk; k < k_max; k++)
	3794	kpanelp[kMAT_BLK_SZ+(k-kk)] = SubMod(kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	3795
	3796
	3797	NTL_GEXEC_RANGE(seq, npanels, first, last)
	3798	NTL_IMPORT(p)
	3799	NTL_IMPORT(n)
	3800	NTL_IMPORT(ll_red_struct)
	3801	NTL_IMPORT(kpanel)
	3802	NTL_IMPORT(kpanelp)
	3803	NTL_IMPORT(kk)
	3804	NTL_IMPORT(k_max)
	3805
	3806
	3807	UniqueArray<long> buf_store;
	3808	buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	3809	long *buf = &buf_store[0];
	3810
	3811	for (long jpanel = first; jpanel < last; jpanel++) {
	3812	if (jpanel == kpanel) continue;
	3813
	3814	long * NTL_RESTRICT jpanelp = &M[jpanel][0];
	3815
	3816	// perform swaps
	3817	for (long k = kk; k < k_max; k++) {
	3818	long pos = P[k];
	3819	if (pos != k) {
	3820	// swap rows pos and k
	3821	long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
	3822	long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
	3823	for (long j = 0; j < MAT_BLK_SZ; j++)
	3824	_ntl_swap(pos_p[j], k_p[j]);
	3825	}
	3826	}
	3827
	3828	// copy block number kpanel (the one on the diagonal) into buf
	3829	// here, we transpose it
	3830
	3831	for (long k = kk; k < k_max; k++)
	3832	for (long j = 0; j < MAT_BLK_SZ; j++)
	3833	buf[j*MAT_BLK_SZ + (k-kk)] =
	3834	jpanelp[k*MAT_BLK_SZ+j];
	3835
	3836
	3837	// jpanel += kpanel*buf
	3838
	3839	muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk, p, ll_red_struct);
	3840	}
	3841
	3842	NTL_GEXEC_RANGE_END
	3843
	3844	// special processing: add 1 back to the diangonal
	3845
	3846	for (long k = kk; k < k_max; k++)
	3847	kpanelp[kMAT_BLK_SZ+(k-kk)] = AddMod(kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	3848
	3849	}
	3850
	3851	if (pivoting) {
	3852	// pivot colums, using reverse swap sequence
	3853
	3854	for (long k = n-1; k >= 0; k--) {
	3855	long pos = P[k];
	3856	if (pos != k) {
	3857	// swap columns pos and k
	3858
	3859	long * NTL_RESTRICT x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
	3860	long * NTL_RESTRICT y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
	3861	for (long i = 0; i < n; i++) {
	3862	_ntl_swap(x[iMAT_BLK_SZ], y[iMAT_BLK_SZ]);
	3863	}
	3864	}
	3865	}
	3866	}
	3867
	3868	// copy panels into X
	3869	X.SetDims(n, n);
	3870	for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
	3871	long j_max = min(jj+MAT_BLK_SZ, n);
	3872	long *panelp = &M[panel][0];
	3873
	3874	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	3875	zz_p *xp = X[i].elts() + jj;
	3876
	3877	for (long j = jj; j < j_max; j++)
	3878	xp[j-jj].LoopHole() = panelp[j-jj];
	3879	}
	3880	}
	3881
	3882	d.LoopHole() = det;
	3883
	3884	}
	3885
	3886
	3887
	3888	#endif
	3889
	3890
	3891
	3892	void relaxed_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
	3893	{
	3894	long n = A.NumRows();
	3895
	3896	if (A.NumCols() != n)
	3897	LogicError("inv: nonsquare matrix");
	3898
	3899	#ifndef NTL_HAVE_LL_TYPE
	3900
	3901	basic_inv(d, X, A, relax);
	3902
	3903	#else
	3904
	3905	long p = zz_p::modulus();
	3906
	3907	if (n < 16) {
	3908	//cerr << "basic_inv\n";
	3909	basic_inv(d, X, A, relax);
	3910	}
	3911	else if (n/MAT_BLK_SZ < 4) {
	3912	long V = 64;
	3913
	3914	#ifdef NTL_HAVE_AVX
	3915	if (p-1 <= MAX_DBL_INT &&
	3916	V <= (MAX_DBL_INT-(p-1))/(p-1) &&
	3917	V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
	3918
	3919	//cerr << "alt_inv_DD\n";
	3920	alt_inv_DD(d, X, A, relax);
	3921	}
	3922	else
	3923	#endif
	3924	if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
	3925	cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
	3926
	3927	//cerr << "alt_inv_L\n";
	3928	alt_inv_L(d, X, A, relax);
	3929
	3930	}
	3931	else {
	3932
	3933	//cerr << "basic_inv\n";
	3934	basic_inv(d, X, A, relax);
	3935	}
	3936	}
	3937	else {
	3938	long V = 4*MAT_BLK_SZ;
	3939
	3940	#ifdef NTL_HAVE_AVX
	3941	if (p-1 <= MAX_DBL_INT &&
	3942	V <= (MAX_DBL_INT-(p-1))/(p-1) &&
	3943	V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
	3944
	3945	//cerr << "blk_inv_DD\n";
	3946	blk_inv_DD(d, X, A, relax);
	3947	}
	3948	else
	3949	#endif
	3950	if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
	3951	cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
	3952
	3953	//cerr << "blk_inv_L\n";
	3954	blk_inv_L(d, X, A, relax);
	3955
	3956	}
	3957	else {
	3958
	3959	//cerr << "blk_inv_LL\n";
	3960	blk_inv_LL(d, X, A, relax);
	3961	}
	3962
	3963	}
	3964
	3965	#endif
	3966
	3967
	3968
	3969	}
	3970
	3971
	3972
	3973	// ******************************************************************
	3974	//
	3975	// Triangularizing square matrices, with applications
	3976	// to solving linear systems and computing determinants.
	3977	// Should be about 3x faster than the matrix inverse
	3978	// algorithms.
	3979	//
	3980	// ******************************************************************
	3981
	3982
	3983	static
	3984	void basic_tri(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
	3985	vec_zz_p *xp, bool trans, bool relax)
	3986	{
	3987	long n = A.NumRows();
	3988
	3989	// adjust
	3990	if (A.NumCols() != n)
	3991	LogicError("tri: nonsquare matrix");
	3992
	3993	// adjust
	3994	if (bp && bp->length() != n)
	3995	LogicError("tri: dimension mismatch");
	3996
	3997	// adjust
	3998	if (bp && !xp)
	3999	LogicError("tri: bad args");
	4000
	4001	if (n == 0) {
	4002	set(d);
	4003	// adjust
	4004	if (xp) xp->SetLength(0);
	4005	return;
	4006	}
	4007
	4008	// adjust (several lines)
	4009	// scratch space
	4010	Mat<long> M;
	4011	if (!trans) {
	4012	conv(M, A);
	4013	}
	4014	else {
	4015	M.SetDims(n, n);
	4016	for (long i = 0; i < n; i++)
	4017	for (long j = 0; j < n; j++)
	4018	M[i][j] = rep(A[j][i]);
	4019	}
	4020
	4021	Vec<long> bv;
	4022	if (bp) conv(bv, *bp);
	4023	// end adjust
	4024
	4025
	4026	Vec<long> P;
	4027	P.SetLength(n);
	4028	for (long k = 0; k < n; k++) P[k] = k;
	4029	// records swap operations
	4030
	4031	long det;
	4032	det = 1;
	4033
	4034	long p = zz_p::modulus();
	4035	mulmod_t pinv = zz_p::ModulusInverse();
	4036
	4037
	4038	bool pivoting = false;
	4039
	4040	for (long k = 0; k < n; k++) {
	4041	long pos = -1;
	4042	long pivot_inv;
	4043	for (long i = k; i < n; i++) {
	4044	// NOTE: by using InvModStatus, this code will work
	4045	// for prime-powers as well as primes
	4046	long pivot = M[i][k];
	4047	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	4048	pos = i;
	4049	break;
	4050	}
	4051	}
	4052
	4053	if (pos != -1) {
	4054	if (k != pos) {
	4055	swap(M[pos], M[k]);
	4056	det = NegateMod(det, p);
	4057	P[k] = pos;
	4058	pivoting = true;
	4059
	4060	// adjust
	4061	if (bp) _ntl_swap(bv[pos], bv[k]);
	4062	}
	4063
	4064	det = MulMod(det, M[k][k], p);
	4065
	4066	{
	4067	// multiply row k by pivot_inv
	4068	long t1 = pivot_inv;
	4069	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	4070	long * NTL_RESTRICT y = &M[k][0];
	4071	// adjust
	4072	for (long j = k+1; j < n; j++)
	4073	y[j] = MulModPrecon(y[j], t1, p, t1pinv);
	4074
	4075	// adjust // y[k] = pivot_inv;
	4076
	4077	// adjust
	4078	if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
	4079	}
	4080
	4081
	4082
	4083	// adjust
	4084	bool seq = n-(k+1) < PAR_THRESH_SQ;
	4085	NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
	4086	NTL_IMPORT(p)
	4087	NTL_IMPORT(n)
	4088	NTL_IMPORT(k)
	4089	long * NTL_RESTRICT y = &M[k][0];
	4090
	4091	// adjust
	4092	for (long ii = first; ii < last; ii++) {
	4093	long i = ii + k+1;
	4094
	4095	long * NTL_RESTRICT x = &M[i][0];
	4096	long t1 = x[k];
	4097	t1 = NegateMod(t1, p);
	4098	// adjust // x[k] = 0;
	4099	if (t1 == 0) continue;
	4100
	4101	// add t1 * row k to row i
	4102	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	4103
	4104	// adjust
	4105	for (long j = k+1; j < n; j++) {
	4106	long t2 = MulModPrecon(y[j], t1, p, t1pinv);
	4107	x[j] = AddMod(x[j], t2, p);
	4108	}
	4109
	4110	// adjust
	4111	if (bp)
	4112	{
	4113	long t2 = MulModPrecon(bv[k], t1, p, t1pinv);
	4114	bv[i] = AddMod(bv[i], t2, p);
	4115	}
	4116	}
	4117	NTL_GEXEC_RANGE_END
	4118	}
	4119	else {
	4120	clear(d);
	4121	return;
	4122	}
	4123	}
	4124
	4125
	4126	// adjust
	4127	if (bp) {
	4128	xp->SetLength(n);
	4129	zz_p *X = xp->elts();
	4130
	4131	for (long i = n-1; i >= 0; i--) {
	4132	long t1 = 0;
	4133	for (long j = i+1; j < n; j++) {
	4134	long t2 = MulMod(rep(X[j]), M[i][j], p);
	4135	t1 = AddMod(t1, t2, p);
	4136	}
	4137	X[i].LoopHole() = SubMod(bv[i], t1, p);
	4138	}
	4139	}
	4140
	4141	d.LoopHole() = det;
	4142	}
	4143
	4144
	4145
	4146
	4147	#ifdef NTL_HAVE_LL_TYPE
	4148
	4149
	4150
	4151	static
	4152	void alt_tri_L(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
	4153	vec_zz_p *xp, bool trans, bool relax)
	4154	{
	4155	long n = A.NumRows();
	4156
	4157	if (A.NumCols() != n)
	4158	LogicError("tri: nonsquare matrix");
	4159
	4160	// adjust
	4161	if (bp && bp->length() != n)
	4162	LogicError("tri: dimension mismatch");
	4163
	4164	// adjust
	4165	if (bp && !xp)
	4166	LogicError("tri: bad args");
	4167
	4168	if (n == 0) {
	4169	set(d);
	4170	if (xp) xp->SetLength(0);
	4171	return;
	4172	}
	4173
	4174
	4175	// scratch space
	4176	Mat<unsigned long> M;
	4177	if (!trans) {
	4178	conv(M, A);
	4179	}
	4180	else {
	4181	M.SetDims(n, n);
	4182	for (long i = 0; i < n; i++)
	4183	for (long j = 0; j < n; j++)
	4184	M[i][j] = rep(A[j][i]);
	4185	}
	4186
	4187	Vec<long> bv;
	4188	if (bp) conv(bv, *bp);
	4189
	4190	Vec<long> P;
	4191	P.SetLength(n);
	4192	for (long k = 0; k < n; k++) P[k] = k;
	4193	// records swap operations
	4194
	4195	long det;
	4196	det = 1;
	4197
	4198	long p = zz_p::modulus();
	4199	mulmod_t pinv = zz_p::ModulusInverse();
	4200	sp_reduce_struct red_struct = zz_p::red_struct();
	4201
	4202
	4203
	4204	bool pivoting = false;
	4205
	4206	unsigned long ured_trigger =
	4207	(~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
	4208	// NOTE: corner case at p == 2: need unsigned long to prevent overflow
	4209
	4210	long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
	4211
	4212	long red_count = red_trigger;
	4213
	4214
	4215	for (long k = 0; k < n; k++) {
	4216	bool cleanup = false;
	4217
	4218	if (red_count-1 < 0) {
	4219	red_count = red_trigger;
	4220	cleanup = true;
	4221	}
	4222
	4223	red_count = red_count-1;
	4224
	4225	long pos = -1;
	4226	long pivot;
	4227	long pivot_inv;
	4228
	4229	for (long i = k; i < n; i++) {
	4230	// NOTE: by using InvModStatus, this code will work
	4231	// for prime-powers as well as primes
	4232	pivot = rem(M[i][k], p, red_struct);
	4233	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	4234	pos = i;
	4235	break;
	4236	}
	4237	}
	4238
	4239	if (pos != -1) {
	4240	if (k != pos) {
	4241	swap(M[pos], M[k]);
	4242	det = NegateMod(det, p);
	4243	P[k] = pos;
	4244	pivoting = true;
	4245
	4246	if (bp) _ntl_swap(bv[pos], bv[k]);
	4247	}
	4248
	4249	det = MulMod(det, pivot, p);
	4250
	4251	{
	4252	// multiply row k by pivot_inv
	4253	long t1 = pivot_inv;
	4254	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
	4255	unsigned long * NTL_RESTRICT y = &M[k][0];
	4256	for (long j = k+1; j < n; j++) {
	4257	long t2 = rem(y[j], p, red_struct);
	4258	y[j] = MulModPrecon(t2, t1, p, t1pinv);
	4259	}
	4260
	4261	if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
	4262	}
	4263
	4264
	4265
	4266	bool seq = n-(k+1) < PAR_THRESH_SQ;
	4267	NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
	4268	NTL_IMPORT(p)
	4269	NTL_IMPORT(n)
	4270	NTL_IMPORT(k)
	4271	NTL_IMPORT(red_struct)
	4272	unsigned long * NTL_RESTRICT y = &M[k][0];
	4273	if (cleanup) {
	4274	for (long ii = first; ii < last; ii++) {
	4275	long i = ii + k+1;
	4276
	4277	unsigned long * NTL_RESTRICT x = &M[i][0];
	4278	for (long j = k+1; j < n; j++) {
	4279	x[j] = rem(x[j], p, red_struct);
	4280	}
	4281	}
	4282	}
	4283
	4284
	4285	for (long ii = first; ii < last; ii++) {
	4286	long i = ii + k+1;
	4287
	4288	unsigned long * NTL_RESTRICT x = &M[i][0];
	4289	long t1 = rem(x[k], p, red_struct);
	4290	t1 = NegateMod(t1, p);
	4291	if (t1 == 0) continue;
	4292
	4293	// add t1 * row k to row i
	4294	unsigned long ut1 = t1;
	4295	long j;
	4296	for (j = k+1; j <= n-4; j+=4) {
	4297	unsigned long xj0 = x[j+0] + DO_MUL(y[j+0], ut1);
	4298	unsigned long xj1 = x[j+1] + DO_MUL(y[j+1], ut1);
	4299	unsigned long xj2 = x[j+2] + DO_MUL(y[j+2], ut1);
	4300	unsigned long xj3 = x[j+3] + DO_MUL(y[j+3], ut1);
	4301	x[j+0] = xj0;
	4302	x[j+1] = xj1;
	4303	x[j+2] = xj2;
	4304	x[j+3] = xj3;
	4305	}
	4306	for (; j < n; j++) {
	4307	x[j] += DO_MUL(y[j], ut1);
	4308	}
	4309
	4310	if (bp)
	4311	{
	4312	long t2 = MulMod(bv[k], t1, p);
	4313	bv[i] = AddMod(bv[i], t2, p);
	4314	}
	4315	}
	4316	NTL_GEXEC_RANGE_END
	4317	}
	4318	else {
	4319	clear(d);
	4320	return;
	4321	}
	4322	}
	4323
	4324
	4325
	4326	if (bp) {
	4327	xp->SetLength(n);
	4328	zz_p *X = xp->elts();
	4329
	4330	for (long i = n-1; i >= 0; i--) {
	4331	long t1 = 0;
	4332	for (long j = i+1; j < n; j++) {
	4333	long t0 = rem(M[i][j], p, red_struct);
	4334	long t2 = MulMod(rep(X[j]), t0, p);
	4335	t1 = AddMod(t1, t2, p);
	4336	}
	4337	X[i].LoopHole() = SubMod(bv[i], t1, p);
	4338	}
	4339	}
	4340
	4341	d.LoopHole() = det;
	4342	}
	4343
	4344
	4345
	4346
	4347	#ifdef NTL_HAVE_AVX
	4348
		// alt_tri_DD: row-oriented Gaussian elimination modulo p = zz_p::modulus(),
		// with the working matrix kept as n rows of n doubles.
		//   d     - output: set to 1 when n == 0, cleared when some column has no
		//           usable pivot, otherwise assigned the accumulated value det
		//   A     - input matrix; must be square (LogicError otherwise)
		//   bp    - optional right-hand side; its length must equal n
		//   xp    - output vector; must be non-null whenever bp is non-null
		//   trans - if true the working copy M is filled with A transposed
		//   relax - forwarded to relaxed_InvModStatus during the pivot search
		// When bp is given and every column yields a pivot, *xp receives the x
		// obtained by back substitution on the reduced system [M | *bp].
	4349	static
	4350	void alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
	4351	vec_zz_p *xp, bool trans, bool relax)
	4352	{
	4353	long n = A.NumRows();
	4354
	4355	if (A.NumCols() != n)
	4356	LogicError("tri: nonsquare matrix");
	4357
	4358	// adjust
	4359	if (bp && bp->length() != n)
	4360	LogicError("tri: dimension mismatch");
	4361
	4362	// adjust
	4363	if (bp && !xp)
	4364	LogicError("tri: bad args");
	4365
		// empty matrix: d is set to one and the solution vector is emptied
	4366	if (n == 0) {
	4367	set(d);
	4368	if (xp) xp->SetLength(0);
	4369	return;
	4370	}
	4371
	4372
	4373	// scratch space
	4374
		// M is the working copy of A (or of A transposed), one AlignedArray per row
	4375	Vec< AlignedArray<double> > M;
	4376	M.SetLength(n);
	4377	for (long i = 0; i < n; i++) M[i].SetLength(n);
	4378	if (!trans) {
	4379	for (long i = 0; i < n; i++)
	4380	for (long j = 0; j < n; j++)
	4381	M[i][j] = rep(A[i][j]);
	4382	}
	4383	else {
	4384	for (long i = 0; i < n; i++)
	4385	for (long j = 0; j < n; j++)
	4386	M[i][j] = rep(A[j][i]);
	4387	}
	4388
		// bv is the right-hand side as plain longs; it stays empty when bp is null
	4389	Vec<long> bv;
	4390	if (bp) conv(bv, *bp);
	4391
		// NOTE(review): P and pivoting are written below but never read again
		// inside this function
	4392	Vec<long> P;
	4393	P.SetLength(n);
	4394	for (long k = 0; k < n; k++) P[k] = k;
	4395	// records swap operations
	4396
	4397	long det;
	4398	det = 1;
	4399
	4400	long p = zz_p::modulus();
	4401	mulmod_t pinv = zz_p::ModulusInverse();
	4402	sp_reduce_struct red_struct = zz_p::red_struct();
	4403
	4404
	4405
	4406	bool pivoting = false;
	4407
		// red_trigger: how many unreduced updates of size at most (p-1)*(p-1) can be
		// added to an entry of size at most p-1 without exceeding MAX_DBL_INT
		// (presumably the largest integer a double holds exactly -- confirm)
	4408	long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
	4409	long red_count = red_trigger;
	4410
	4411	for (long k = 0; k < n; k++) {
	4412	bool cleanup = false;
	4413
		// once the budget is used up, request a full reduction mod p of the
		// remaining rows during this step and restart the budget
	4414	if (red_count-1 < 0) {
	4415	red_count = red_trigger;
	4416	cleanup = true;
	4417	}
	4418
	4419	red_count = red_count-1;
	4420
	4421	long pos = -1;
	4422	long pivot;
	4423	long pivot_inv;
	4424
		// search column k, from row k downward, for the first entry that is nonzero
		// mod p and for which relaxed_InvModStatus returns 0, leaving its inverse
		// in pivot_inv
	4425	for (long i = k; i < n; i++) {
	4426	// NOTE: by using InvModStatus, this code will work
	4427	// for prime-powers as well as primes
	4428	pivot = rem((unsigned long)(long)M[i][k], p, red_struct);
	4429	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	4430	pos = i;
	4431	break;
	4432	}
	4433	}
	4434
	4435	if (pos != -1) {
		// a row swap negates the determinant; bv is permuted the same way
	4436	if (k != pos) {
	4437	swap(M[pos], M[k]);
	4438	det = NegateMod(det, p);
	4439	P[k] = pos;
	4440	pivoting = true;
	4441
	4442	if (bp) _ntl_swap(bv[pos], bv[k]);
	4443	}
	4444
	4445	det = MulMod(det, pivot, p);
	4446
	4447	{
	4448	// multiply row k by pivot_inv
		// only columns k+1..n-1 are rescaled (and reduced mod p on the way)
	4449	long t1 = pivot_inv;
	4450	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
	4451	double * NTL_RESTRICT y = &M[k][0];
	4452	for (long j = k+1; j < n; j++) {
	4453	long t2 = rem((unsigned long)(long)y[j], p, red_struct);
	4454	y[j] = MulModPrecon(t2, t1, p, t1pinv);
	4455	}
	4456
	4457	if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
	4458	}
	4459
	4460
	4461
		// the n-(k+1) rows below row k are updated through NTL_GEXEC_RANGE over the
		// index range [first, last); seq is true for small ranges (presumably this
		// selects sequential execution -- confirm against the macro definition)
	4462	bool seq = n-(k+1) < PAR_THRESH_SQ;
	4463	NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
	4464	NTL_IMPORT(p)
	4465	NTL_IMPORT(n)
	4466	NTL_IMPORT(k)
	4467	NTL_IMPORT(red_struct)
	4468	double * NTL_RESTRICT y = &M[k][0];
		// cleanup pass: bring columns k+1..n-1 of the rows in this range back mod p
	4469	if (cleanup) {
	4470	for (long ii = first; ii < last; ii++) {
	4471	long i = ii + k+1;
	4472
	4473	double * NTL_RESTRICT x = &M[i][0];
	4474	for (long j = k+1; j < n; j++) {
	4475	x[j] = rem((unsigned long)(long)x[j], p, red_struct);
	4476	}
	4477	}
	4478	}
	4479
		// align_boundary: k+1 rounded up to a multiple of NTL_AVX_DBL_ALIGN, capped
		// at n; columns before it are updated one at a time, the rest are handed to
		// muladd_interval1
	4480	long align_boundary =
	4481	min((((k+1)+(NTL_AVX_DBL_ALIGN-1))/NTL_AVX_DBL_ALIGN)*NTL_AVX_DBL_ALIGN, n);
	4482
	4483
	4484	for (long ii = first; ii < last; ii++) {
	4485	long i = ii + k+1;
	4486
	4487	double * NTL_RESTRICT x = &M[i][0];
		// t1 = -(M[i][k]) mod p; a zero multiplier leaves row i untouched
	4488	long t1 = rem((unsigned long)(long)x[k], p, red_struct);
	4489	t1 = NegateMod(t1, p);
	4490	if (t1 == 0) continue;
	4491
	4492	// add t1 * row k to row i
		// the products are accumulated without reduction mod p
	4493	double ut1 = t1;
	4494	for (long j = k+1; j < align_boundary; j++) x[j] += y[j]*ut1;
	4495	muladd_interval1(x+align_boundary, y+align_boundary, ut1, n-align_boundary);
	4496
	4497	if (bp)
	4498	{
	4499	long t2 = MulMod(bv[k], t1, p);
	4500	bv[i] = AddMod(bv[i], t2, p);
	4501	}
	4502	}
	4503	NTL_GEXEC_RANGE_END
	4504	}
	4505	else {
		// no usable pivot in column k: report zero and leave *xp untouched
	4506	clear(d);
	4507	return;
	4508	}
	4509	}
	4510
	4511
	4512
		// back substitution, last row first: X[i] = bv[i] - sum_{j>i} M[i][j]*X[j];
		// no division is needed because every pivot row was scaled by pivot_inv
	4513	if (bp) {
	4514	xp->SetLength(n);
	4515	zz_p *X = xp->elts();
	4516
	4517	for (long i = n-1; i >= 0; i--) {
	4518	long t1 = 0;
	4519	for (long j = i+1; j < n; j++) {
	4520	long t0 = rem((unsigned long)(long)M[i][j], p, red_struct);
	4521	long t2 = MulMod(rep(X[j]), t0, p);
	4522	t1 = AddMod(t1, t2, p);
	4523	}
	4524	X[i].LoopHole() = SubMod(bv[i], t1, p);
	4525	}
	4526	}
	4527
	4528	d.LoopHole() = det;
	4529	}
	4530
	4531
	4532	#endif
	4533
	4534
	4535
	4536
	4537	#ifdef NTL_HAVE_AVX
	4538
	4539	static
	4540	void blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
	4541	vec_zz_p *xp, bool trans, bool relax)
	4542	{
	4543	long n = A.NumRows();
	4544
	4545	if (A.NumCols() != n)
	4546	LogicError("tri: nonsquare matrix");
	4547
	4548	if (bp && bp->length() != n)
	4549	LogicError("tri: dimension mismatch");
	4550
	4551	if (bp && !xp)
	4552	LogicError("tri: bad args");
	4553
	4554	if (n == 0) {
	4555	set(d);
	4556	if (xp) xp->SetLength(0);
	4557	return;
	4558	}
	4559
	4560	if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	4561
	4562	long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	4563
	4564	Vec< AlignedArray<double> > M;
	4565	M.SetLength(npanels);
	4566	for (long panel = 0; panel < npanels; panel++) {
	4567	M[panel].SetLength(n*MAT_BLK_SZ);
	4568	double *panelp = &M[panel][0];
	4569
	4570	for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
	4571	}
	4572
	4573	if (trans) {
	4574	// copy A transposed into panels
	4575	for (long i = 0; i < n; i++) {
	4576	const zz_p *row = &A[i][0];
	4577	double *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
	4578	for (long j = 0; j < n; j++)
	4579	col[j*MAT_BLK_SZ] = rep(row[j]);
	4580	}
	4581	}
	4582	else {
	4583	// copy A into panels
	4584	for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
	4585	long j_max = min(jj+MAT_BLK_SZ, n);
	4586	double *panelp = &M[panel][0];
	4587
	4588	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	4589	const zz_p *ap = A[i].elts() + jj;
	4590
	4591	for (long j = jj; j < j_max; j++)
	4592	panelp[j-jj] = rep(ap[j-jj]);
	4593	}
	4594	}
	4595	}
	4596
	4597	Vec<long> bv;
	4598	if (bp) conv(bv, *bp);
	4599
	4600	Vec<long> P;
	4601	P.SetLength(n);
	4602	for (long k = 0; k < n; k++) P[k] = k;
	4603	// records swap operations
	4604
	4605
	4606	long det;
	4607	det = 1;
	4608
	4609	long p = zz_p::modulus();
	4610	mulmod_t pinv = zz_p::ModulusInverse();
	4611	sp_reduce_struct red_struct = zz_p::red_struct();
	4612
	4613
	4614	bool pivoting = false;
	4615
	4616	long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
	4617	long red_count = red_trigger;
	4618
	4619	for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
	4620	long k_max = min(kk+MAT_BLK_SZ, n);
	4621
	4622	bool cleanup = false;
	4623
	4624	if (red_count-MAT_BLK_SZ < 0) {
	4625	red_count = red_trigger;
	4626	cleanup = true;
	4627	}
	4628
	4629	red_count = red_count-MAT_BLK_SZ;
	4630	double * NTL_RESTRICT kpanelp = &M[kpanel][0];
	4631
	4632	if (cleanup) {
	4633	for (long r = kkMAT_BLK_SZ; r < nMAT_BLK_SZ; r++)
	4634	kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
	4635	}
	4636
	4637	for (long k = kk; k < k_max; k++) {
	4638
	4639	long pos = -1;
	4640	long pivot;
	4641	long pivot_inv;
	4642
	4643	for (long i = k; i < n; i++) {
	4644	// NOTE: by using InvModStatus, this code will work
	4645	// for prime-powers as well as primes
	4646	pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
	4647	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	4648	pos = i;
	4649	break;
	4650	}
	4651	}
	4652
	4653	if (pos == -1) {
	4654	clear(d);
	4655	return;
	4656	}
	4657
	4658	double * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
	4659	if (k != pos) {
	4660	// swap rows pos and k
	4661	double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
	4662	for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
	4663
	4664	det = NegateMod(det, p);
	4665	P[k] = pos;
	4666	pivoting = true;
	4667
	4668	if (bp) _ntl_swap(bv[pos], bv[k]);
	4669	}
	4670
	4671	det = MulMod(det, pivot, p);
	4672
	4673	{
	4674	// multiply row k by pivot_inv
	4675	long t1 = pivot_inv;
	4676	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	4677	for (long j = 0; j < MAT_BLK_SZ; j++) {
	4678	long t2 = rem((unsigned long)(long)y[j], p, red_struct);
	4679	y[j] = MulModPrecon(t2, t1, p, t1pinv);
	4680	}
	4681
	4682	y[k-kk] = pivot_inv;
	4683
	4684	if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
	4685	}
	4686
	4687	for (long i = kk; i < n; i++) {
	4688	if (i == k) continue; // skip row k
	4689
	4690	double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
	4691	long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
	4692	t1 = NegateMod(t1, p);
	4693	x[k-kk] = 0;
	4694	if (t1 == 0) continue;
	4695
	4696	// add t1 * row k to row i
	4697	double ut1 = t1;
	4698	muladd_interval(x, y, ut1, MAT_BLK_SZ);
	4699	if (bp)
	4700	{
	4701	long t2 = MulMod(bv[k], t1, p);
	4702	bv[i] = AddMod(bv[i], t2, p);
	4703	}
	4704	}
	4705	}
	4706
	4707
	4708	// finished processing current kpanel
	4709	// next, reduce and apply to all other kpanels
	4710
	4711	for (long r = kkMAT_BLK_SZ; r < nMAT_BLK_SZ; r++)
	4712	kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
	4713
	4714	// special processing: subtract 1 off of diangonal
	4715
	4716	for (long k = kk; k < k_max; k++)
	4717	kpanelp[kMAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	4718
	4719
	4720	bool seq = double(npanels-(kpanel+1))double(n)double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;
	4721
	4722	NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
	4723	NTL_IMPORT(p)
	4724	NTL_IMPORT(n)
	4725	NTL_IMPORT(red_struct)
	4726	NTL_IMPORT(kpanel)
	4727	NTL_IMPORT(kpanelp)
	4728	NTL_IMPORT(kk)
	4729	NTL_IMPORT(k_max)
	4730
	4731
	4732	AlignedArray<double> buf_store;
	4733	buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	4734	double *buf = &buf_store[0];
	4735
	4736	for (long index = first; index < last; index++) {
	4737	long jpanel = index + kpanel+1;
	4738
	4739	double * NTL_RESTRICT jpanelp = &M[jpanel][0];
	4740
	4741	if (cleanup) {
	4742	for (long r = kkMAT_BLK_SZ; r < nMAT_BLK_SZ; r++)
	4743	jpanelp[r] = rem((unsigned long)(long)jpanelp[r], p, red_struct);
	4744	}
	4745
	4746	// perform swaps
	4747	for (long k = kk; k < k_max; k++) {
	4748	long pos = P[k];
	4749	if (pos != k) {
	4750	// swap rows pos and k
	4751	double * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
	4752	double * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
	4753	for (long j = 0; j < MAT_BLK_SZ; j++)
	4754	_ntl_swap(pos_p[j], k_p[j]);
	4755	}
	4756	}
	4757
	4758	// copy block number kpanel (the one on the diagonal) into buf
	4759
	4760	for (long i = 0; i < (k_max-kk)*MAT_BLK_SZ; i++)
	4761	buf[i] = rem((unsigned long)(long)jpanelp[kk*MAT_BLK_SZ+i], p, red_struct);
	4762
	4763	// jpanel += kpanel*buf
	4764
	4765	muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk);
	4766	}
	4767
	4768	NTL_GEXEC_RANGE_END
	4769
	4770	// special processing: add 1 back to the diangonal
	4771
	4772	for (long k = kk; k < k_max; k++)
	4773	kpanelp[kMAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	4774
	4775	}
	4776
	4777	if (bp) {
	4778	xp->SetLength(n);
	4779	zz_p *X = xp->elts();
	4780
	4781	for (long i = n-1; i >= 0; i--) {
	4782	long t1 = 0;
	4783	long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	4784	for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
	4785	jj < n; jj += MAT_BLK_SZ, panel++) {
	4786	long j_max = min(jj+MAT_BLK_SZ, n);
	4787	double row = &M[panel][iMAT_BLK_SZ];
	4788	for (long j = jj; j < j_max; j++) {
	4789	long t0 = rem((unsigned long)(long)row[j-jj], p, red_struct);
	4790	long t2 = MulMod(rep(X[j]), t0, p);
	4791	t1 = AddMod(t1, t2, p);
	4792	}
	4793	}
	4794	X[i].LoopHole() = SubMod(bv[i], t1, p);
	4795	}
	4796	}
	4797
	4798	d.LoopHole() = det;
	4799
	4800	}
	4801
	4802	#endif
	4803
	4804
	4805	static
	4806	void blk_tri_L(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
	4807	vec_zz_p *xp, bool trans, bool relax)
	4808	{
	4809	long n = A.NumRows();
	4810
	4811	if (A.NumCols() != n)
	4812	LogicError("tri: nonsquare matrix");
	4813
	4814	if (bp && bp->length() != n)
	4815	LogicError("tri: dimension mismatch");
	4816
	4817	if (bp && !xp)
	4818	LogicError("tri: bad args");
	4819
	4820	if (n == 0) {
	4821	set(d);
	4822	if (xp) xp->SetLength(0);
	4823	return;
	4824	}
	4825
	4826	if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	4827
	4828	long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	4829
	4830	Vec< UniqueArray<unsigned long> > M;
	4831	M.SetLength(npanels);
	4832	for (long panel = 0; panel < npanels; panel++) {
	4833	M[panel].SetLength(n*MAT_BLK_SZ);
	4834	unsigned long *panelp = &M[panel][0];
	4835
	4836	for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
	4837	}
	4838
	4839	if (trans) {
	4840	// copy A transposed into panels
	4841	for (long i = 0; i < n; i++) {
	4842	const zz_p *row = &A[i][0];
	4843	unsigned long *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
	4844	for (long j = 0; j < n; j++)
	4845	col[j*MAT_BLK_SZ] = rep(row[j]);
	4846	}
	4847	}
	4848	else {
	4849	// copy A into panels
	4850	for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
	4851	long j_max = min(jj+MAT_BLK_SZ, n);
	4852	unsigned long *panelp = &M[panel][0];
	4853
	4854	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	4855	const zz_p *ap = A[i].elts() + jj;
	4856
	4857	for (long j = jj; j < j_max; j++)
	4858	panelp[j-jj] = rep(ap[j-jj]);
	4859	}
	4860	}
	4861	}
	4862
	4863	Vec<long> bv;
	4864	if (bp) conv(bv, *bp);
	4865
	4866	Vec<long> P;
	4867	P.SetLength(n);
	4868	for (long k = 0; k < n; k++) P[k] = k;
	4869	// records swap operations
	4870
	4871
	4872	long det;
	4873	det = 1;
	4874
	4875	long p = zz_p::modulus();
	4876	mulmod_t pinv = zz_p::ModulusInverse();
	4877	sp_reduce_struct red_struct = zz_p::red_struct();
	4878
	4879
	4880	bool pivoting = false;
	4881
	4882	unsigned long ured_trigger =
	4883	(~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
	4884	// NOTE: corner case at p == 2: need unsigned long to prevent overflow
	4885
	4886	long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
	4887
	4888	long red_count = red_trigger;
	4889
	4890	for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
	4891	long k_max = min(kk+MAT_BLK_SZ, n);
	4892
	4893	bool cleanup = false;
	4894
	4895	if (red_count-MAT_BLK_SZ < 0) {
	4896	red_count = red_trigger;
	4897	cleanup = true;
	4898	}
	4899
	4900	red_count = red_count-MAT_BLK_SZ;
	4901	unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
	4902
	4903	if (cleanup) {
	4904	for (long r = kkMAT_BLK_SZ; r < nMAT_BLK_SZ; r++)
	4905	kpanelp[r] = rem(kpanelp[r], p, red_struct);
	4906	}
	4907
	4908	for (long k = kk; k < k_max; k++) {
	4909
	4910	long pos = -1;
	4911	long pivot;
	4912	long pivot_inv;
	4913
	4914	for (long i = k; i < n; i++) {
	4915	// NOTE: by using InvModStatus, this code will work
	4916	// for prime-powers as well as primes
	4917	pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
	4918	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	4919	pos = i;
	4920	break;
	4921	}
	4922	}
	4923
	4924	if (pos == -1) {
	4925	clear(d);
	4926	return;
	4927	}
	4928
	4929	unsigned long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
	4930	if (k != pos) {
	4931	// swap rows pos and k
	4932	unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
	4933	for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
	4934
	4935	det = NegateMod(det, p);
	4936	P[k] = pos;
	4937	pivoting = true;
	4938
	4939	if (bp) _ntl_swap(bv[pos], bv[k]);
	4940	}
	4941
	4942	det = MulMod(det, pivot, p);
	4943
	4944	{
	4945	// multiply row k by pivot_inv
	4946	long t1 = pivot_inv;
	4947	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	4948	for (long j = 0; j < MAT_BLK_SZ; j++) {
	4949	long t2 = rem(y[j], p, red_struct);
	4950	y[j] = MulModPrecon(t2, t1, p, t1pinv);
	4951	}
	4952
	4953	y[k-kk] = pivot_inv;
	4954
	4955	if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
	4956	}
	4957
	4958	for (long i = kk; i < n; i++) {
	4959	if (i == k) continue; // skip row k
	4960
	4961	unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
	4962	long t1 = rem(x[k-kk], p, red_struct);
	4963	t1 = NegateMod(t1, p);
	4964	x[k-kk] = 0;
	4965	if (t1 == 0) continue;
	4966
	4967	// add t1 * row k to row i
	4968	unsigned long ut1 = t1;
	4969	muladd_interval(x, y, ut1, MAT_BLK_SZ);
	4970	if (bp)
	4971	{
	4972	long t2 = MulMod(bv[k], t1, p);
	4973	bv[i] = AddMod(bv[i], t2, p);
	4974	}
	4975	}
	4976	}
	4977
	4978
	4979	// finished processing current kpanel
	4980	// next, reduce and apply to all other kpanels
	4981
	4982	for (long r = kkMAT_BLK_SZ; r < nMAT_BLK_SZ; r++)
	4983	kpanelp[r] = rem(kpanelp[r], p, red_struct);
	4984
	4985	// special processing: subtract 1 off of diangonal
	4986
	4987	for (long k = kk; k < k_max; k++)
	4988	kpanelp[kMAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	4989
	4990
	4991	bool seq = double(npanels-(kpanel+1))double(n)double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;
	4992	NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
	4993	NTL_IMPORT(p)
	4994	NTL_IMPORT(n)
	4995	NTL_IMPORT(red_struct)
	4996	NTL_IMPORT(kpanel)
	4997	NTL_IMPORT(kpanelp)
	4998	NTL_IMPORT(kk)
	4999	NTL_IMPORT(k_max)
	5000
	5001
	5002	UniqueArray<unsigned long> buf_store;
	5003	buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	5004	unsigned long *buf = &buf_store[0];
	5005
	5006	for (long index = first; index < last; index++) {
	5007	long jpanel = index + kpanel+1;
	5008
	5009	unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
	5010
	5011	if (cleanup) {
	5012	for (long r = kkMAT_BLK_SZ; r < nMAT_BLK_SZ; r++)
	5013	jpanelp[r] = rem(jpanelp[r], p, red_struct);
	5014	}
	5015
	5016	// perform swaps
	5017	for (long k = kk; k < k_max; k++) {
	5018	long pos = P[k];
	5019	if (pos != k) {
	5020	// swap rows pos and k
	5021	unsigned long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
	5022	unsigned long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
	5023	for (long j = 0; j < MAT_BLK_SZ; j++)
	5024	_ntl_swap(pos_p[j], k_p[j]);
	5025	}
	5026	}
	5027
	5028	// copy block number kpanel (the one on the diagonal) into buf
	5029	// here, we transpose it
	5030
	5031	for (long k = kk; k < k_max; k++)
	5032	for (long j = 0; j < MAT_BLK_SZ; j++)
	5033	buf[j*MAT_BLK_SZ + (k-kk)] =
	5034	rem(jpanelp[k*MAT_BLK_SZ+j], p, red_struct);
	5035
	5036	// jpanel += kpanel*buf
	5037
	5038	muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk);
	5039	}
	5040
	5041	NTL_GEXEC_RANGE_END
	5042
	5043	// special processing: add 1 back to the diangonal
	5044
	5045	for (long k = kk; k < k_max; k++)
	5046	kpanelp[kMAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	5047
	5048	}
	5049
	5050	if (bp) {
	5051	xp->SetLength(n);
	5052	zz_p *X = xp->elts();
	5053
	5054	for (long i = n-1; i >= 0; i--) {
	5055	long t1 = 0;
	5056	long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	5057	for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
	5058	jj < n; jj += MAT_BLK_SZ, panel++) {
	5059	long j_max = min(jj+MAT_BLK_SZ, n);
	5060	unsigned long row = &M[panel][iMAT_BLK_SZ];
	5061	for (long j = jj; j < j_max; j++) {
	5062	long t0 = rem(row[j-jj], p, red_struct);
	5063	long t2 = MulMod(rep(X[j]), t0, p);
	5064	t1 = AddMod(t1, t2, p);
	5065	}
	5066	}
	5067	X[i].LoopHole() = SubMod(bv[i], t1, p);
	5068	}
	5069	}
	5070
	5071	d.LoopHole() = det;
	5072
	5073	}
	5074
	5075
	5076	static
	5077	void blk_tri_LL(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
	5078	vec_zz_p *xp, bool trans, bool relax)
	5079	{
	5080	long n = A.NumRows();
	5081
	5082	if (A.NumCols() != n)
	5083	LogicError("tri: nonsquare matrix");
	5084
	5085	if (bp && bp->length() != n)
	5086	LogicError("tri: dimension mismatch");
	5087
	5088	if (bp && !xp)
	5089	LogicError("tri: bad args");
	5090
	5091	if (n == 0) {
	5092	set(d);
	5093	if (xp) xp->SetLength(0);
	5094	return;
	5095	}
	5096
	5097	if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	5098
	5099	long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	5100
	5101	Vec< UniqueArray<long> > M;
	5102	M.SetLength(npanels);
	5103	for (long panel = 0; panel < npanels; panel++) {
	5104	M[panel].SetLength(n*MAT_BLK_SZ);
	5105	long *panelp = &M[panel][0];
	5106
	5107	for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
	5108	}
	5109
	5110	if (trans) {
	5111	// copy A transposed into panels
	5112	for (long i = 0; i < n; i++) {
	5113	const zz_p *row = &A[i][0];
	5114	long *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
	5115	for (long j = 0; j < n; j++)
	5116	col[j*MAT_BLK_SZ] = rep(row[j]);
	5117	}
	5118	}
	5119	else {
	5120	// copy A into panels
	5121	for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
	5122	long j_max = min(jj+MAT_BLK_SZ, n);
	5123	long *panelp = &M[panel][0];
	5124
	5125	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	5126	const zz_p *ap = A[i].elts() + jj;
	5127
	5128	for (long j = jj; j < j_max; j++)
	5129	panelp[j-jj] = rep(ap[j-jj]);
	5130	}
	5131	}
	5132	}
	5133
	5134	Vec<long> bv;
	5135	if (bp) conv(bv, *bp);
	5136
	5137	Vec<long> P;
	5138	P.SetLength(n);
	5139	for (long k = 0; k < n; k++) P[k] = k;
	5140	// records swap operations
	5141
	5142
	5143	long det;
	5144	det = 1;
	5145
	5146	long p = zz_p::modulus();
	5147	mulmod_t pinv = zz_p::ModulusInverse();
	5148	sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
	5149
	5150
	5151	bool pivoting = false;
	5152
	5153	for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
	5154	long k_max = min(kk+MAT_BLK_SZ, n);
	5155
	5156	long * NTL_RESTRICT kpanelp = &M[kpanel][0];
	5157
	5158	for (long k = kk; k < k_max; k++) {
	5159
	5160	long pos = -1;
	5161	long pivot;
	5162	long pivot_inv;
	5163
	5164	for (long i = k; i < n; i++) {
	5165	// NOTE: by using InvModStatus, this code will work
	5166	// for prime-powers as well as primes
	5167	pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
	5168	if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
	5169	pos = i;
	5170	break;
	5171	}
	5172	}
	5173
	5174	if (pos == -1) {
	5175	clear(d);
	5176	return;
	5177	}
	5178
	5179	long * NTL_RESTRICT y = &kpanelp[k*MAT_BLK_SZ];
	5180	if (k != pos) {
	5181	// swap rows pos and k
	5182	long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
	5183	for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
	5184
	5185	det = NegateMod(det, p);
	5186	P[k] = pos;
	5187	pivoting = true;
	5188
	5189	if (bp) _ntl_swap(bv[pos], bv[k]);
	5190	}
	5191
	5192	det = MulMod(det, pivot, p);
	5193
	5194	{
	5195	// multiply row k by pivot_inv
	5196	long t1 = pivot_inv;
	5197	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	5198	for (long j = 0; j < MAT_BLK_SZ; j++) {
	5199	y[j] = MulModPrecon(y[j], t1, p, t1pinv);
	5200	}
	5201
	5202	y[k-kk] = pivot_inv;
	5203
	5204	if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
	5205	}
	5206
	5207	for (long i = kk; i < n; i++) {
	5208	if (i == k) continue; // skip row k
	5209
	5210	long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
	5211	long t1 = x[k-kk];
	5212	t1 = NegateMod(t1, p);
	5213	x[k-kk] = 0;
	5214	if (t1 == 0) continue;
	5215
	5216	// add t1 * row k to row i
	5217	long ut1 = t1;
	5218	muladd_interval(x, y, ut1, MAT_BLK_SZ, p, pinv);
	5219	if (bp)
	5220	{
	5221	long t2 = MulMod(bv[k], t1, p);
	5222	bv[i] = AddMod(bv[i], t2, p);
	5223	}
	5224	}
	5225	}
	5226
	5227
	5228	// finished processing current kpanel
	5229	// next, reduce and apply to all other kpanels
	5230
	5231	// special processing: subtract 1 off of diangonal
	5232
	5233	for (long k = kk; k < k_max; k++)
	5234	kpanelp[kMAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	5235
	5236
	5237	bool seq = double(npanels-(kpanel+1))double(n)double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;
	5238	NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
	5239	NTL_IMPORT(p)
	5240	NTL_IMPORT(n)
	5241	NTL_IMPORT(ll_red_struct)
	5242	NTL_IMPORT(kpanel)
	5243	NTL_IMPORT(kpanelp)
	5244	NTL_IMPORT(kk)
	5245	NTL_IMPORT(k_max)
	5246
	5247
	5248	UniqueArray<long> buf_store;
	5249	buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	5250	long *buf = &buf_store[0];
	5251
	5252	for (long index = first; index < last; index++) {
	5253	long jpanel = index + kpanel+1;
	5254
	5255	long * NTL_RESTRICT jpanelp = &M[jpanel][0];
	5256
	5257	// perform swaps
	5258	for (long k = kk; k < k_max; k++) {
	5259	long pos = P[k];
	5260	if (pos != k) {
	5261	// swap rows pos and k
	5262	long * NTL_RESTRICT pos_p = &jpanelp[pos*MAT_BLK_SZ];
	5263	long * NTL_RESTRICT k_p = &jpanelp[k*MAT_BLK_SZ];
	5264	for (long j = 0; j < MAT_BLK_SZ; j++)
	5265	_ntl_swap(pos_p[j], k_p[j]);
	5266	}
	5267	}
	5268
	5269	// copy block number kpanel (the one on the diagonal) into buf
	5270	// here, we transpose it
	5271
	5272	for (long k = kk; k < k_max; k++)
	5273	for (long j = 0; j < MAT_BLK_SZ; j++)
	5274	buf[jMAT_BLK_SZ + (k-kk)] = jpanelp[kMAT_BLK_SZ+j];
	5275
	5276	// jpanel += kpanel*buf
	5277
	5278	muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk, p, ll_red_struct);
	5279	}
	5280
	5281	NTL_GEXEC_RANGE_END
	5282
	5283	// special processing: add 1 back to the diangonal
	5284
	5285	for (long k = kk; k < k_max; k++)
	5286	kpanelp[kMAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[kMAT_BLK_SZ+(k-kk)], 1, p);
	5287
	5288	}
	5289
	5290	if (bp) {
	5291	xp->SetLength(n);
	5292	zz_p *X = xp->elts();
	5293
	5294	for (long i = n-1; i >= 0; i--) {
	5295	long t1 = 0;
	5296	long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	5297	for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
	5298	jj < n; jj += MAT_BLK_SZ, panel++) {
	5299	long j_max = min(jj+MAT_BLK_SZ, n);
	5300	long row = &M[panel][iMAT_BLK_SZ];
	5301	for (long j = jj; j < j_max; j++) {
	5302	long t0 = row[j-jj];
	5303	long t2 = MulMod(rep(X[j]), t0, p);
	5304	t1 = AddMod(t1, t2, p);
	5305	}
	5306	}
	5307	X[i].LoopHole() = SubMod(bv[i], t1, p);
	5308	}
	5309	}
	5310
	5311	d.LoopHole() = det;
	5312
	5313	}
	5314
	5315
	5316
	5317	#endif
	5318
	5319
	5320
	5321	static
	5322	void tri(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
	5323	vec_zz_p *xp, bool trans, bool relax)
	5324	{
	5325	long n = A.NumRows();
	5326
	5327	if (A.NumCols() != n)
	5328	LogicError("inv: nonsquare matrix");
	5329
	5330	if (bp && bp->length() != n)
	5331	LogicError("tri: dimension mismatch");
	5332
	5333	if (bp && !xp)
	5334	LogicError("tri: bad args");
	5335
	5336	#ifndef NTL_HAVE_LL_TYPE
	5337
	5338	basic_tri(d, A, bp, xp, trans, relax);
	5339
	5340	#else
	5341
	5342	long p = zz_p::modulus();
	5343
	5344	if (n < 16) {
	5345	//cerr << "basic_tri\n";
	5346	basic_tri(d, A, bp, xp, trans, relax);
	5347	}
	5348	else if (n/MAT_BLK_SZ < 4) {
	5349	long V = 64;
	5350
	5351	#ifdef NTL_HAVE_AVX
	5352	if (p-1 <= MAX_DBL_INT &&
	5353	V <= (MAX_DBL_INT-(p-1))/(p-1) &&
	5354	V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
	5355
	5356	//cerr << "alt_tri_DD\n";
	5357	alt_tri_DD(d, A, bp, xp, trans, relax);
	5358	}
	5359	else
	5360	#endif
	5361	if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
	5362	cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
	5363
	5364	//cerr << "alt_tri_L\n";
	5365	alt_tri_L(d, A, bp, xp, trans, relax);
	5366
	5367	}
	5368	else {
	5369
	5370	//cerr << "basic_tri\n";
	5371	basic_tri(d, A, bp, xp, trans, relax);
	5372	}
	5373	}
	5374	else {
	5375	long V = 4*MAT_BLK_SZ;
	5376
	5377	#ifdef NTL_HAVE_AVX
	5378	if (p-1 <= MAX_DBL_INT &&
	5379	V <= (MAX_DBL_INT-(p-1))/(p-1) &&
	5380	V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
	5381
	5382	//cerr << "blk_tri_DD\n";
	5383	blk_tri_DD(d, A, bp, xp, trans, relax);
	5384	}
	5385	else
	5386	#endif
	5387	if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
	5388	cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
	5389
	5390	//cerr << "blk_tri_L\n";
	5391	blk_tri_L(d, A, bp, xp, trans, relax);
	5392
	5393	}
	5394	else {
	5395
	5396	//cerr << "blk_tri_LL\n";
	5397	blk_tri_LL(d, A, bp, xp, trans, relax);
	5398	}
	5399
	5400	}
	5401
	5402	#endif
	5403
	5404
	5405
	5406	}
	5407
	5408
	5409
	5410	void relaxed_determinant(zz_p& d, const mat_zz_p& A, bool relax)
	5411	{
	5412	tri(d, A, 0, 0, false, relax);
	5413	}
	5414
	5415
	5416	void relaxed_solve(zz_p& d, vec_zz_p& x,
	5417	const mat_zz_p& A, const vec_zz_p& b, bool relax)
	5418	{
	5419	tri(d, A, &b, &x, true, relax);
	5420	}
	5421
	5422	void relaxed_solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b, bool relax)
	5423	{
	5424	tri(d, A, &b, &x, false, relax);
	5425	}
	5426
	5427	// ******************************************************************
	5428	//
	5429	// new image and kernel routines
	5430	//
	5431	// ******************************************************************
	5432
	5433
	5434	static
	5435	long elim_basic(const mat_zz_p& A, mat_zz_p im, mat_zz_p ker,
	5436	long w, bool full)
	5437	{
	5438	long n = A.NumRows();
	5439	long m = A.NumCols();
	5440
	5441	if (w < 0 \|\| w > m) LogicError("elim: bad args");
	5442
	5443	// take care of corner cases
	5444	if (n == 0) {
	5445	if (im) im->SetDims(0, m);
	5446	if (ker) ker->SetDims(0, 0);
	5447	return 0;
	5448	}
	5449
	5450	if (w == 0) {
	5451	if (im) {
	5452	if (full)
	5453	(*im) = A;
	5454	else
	5455	im->SetDims(0, m);
	5456	}
	5457	if (ker) ident(*ker, n);
	5458	return 0;
	5459	}
	5460
	5461	Mat<long> M;
	5462	conv(M, A);
	5463
	5464	Vec<long> P;
	5465	P.SetLength(n);
	5466	for (long k = 0; k < n; k++) P[k] = k;
	5467	// records swap operations
	5468
	5469	Vec<long> pcol;
	5470	pcol.SetLength(n);
	5471	// pcol[i] records pivot columns for row i
	5472
	5473	long p = zz_p::modulus();
	5474	mulmod_t pinv = zz_p::ModulusInverse();
	5475
	5476	bool pivoting = false;
	5477
	5478	long r = 0;
	5479
	5480	for (long k = 0; k < w; k++) {
	5481	long pos = -1;
	5482	long pivot_inv;
	5483	for (long i = r; i < n; i++) {
	5484	long pivot = M[i][k];
	5485	if (pivot != 0) {
	5486	pivot_inv = InvMod(pivot, p);
	5487	pos = i;
	5488	break;
	5489	}
	5490	}
	5491
	5492	if (pos == -1)
	5493	continue;
	5494
	5495	if (r != pos) {
	5496	swap(M[pos], M[r]);
	5497	P[r] = pos;
	5498	pivoting = true;
	5499	}
	5500
	5501	bool seq = double(n-r)*double(m-k) < PAR_THRESH;
	5502
	5503	NTL_GEXEC_RANGE(seq, n-(r+1), first, last)
	5504	NTL_IMPORT(p)
	5505	NTL_IMPORT(n)
	5506	NTL_IMPORT(k)
	5507	NTL_IMPORT(r)
	5508	long * NTL_RESTRICT y = &M[r][0];
	5509
	5510	for (long ii = first; ii < last; ii++) {
	5511	long i = ii + r+1;
	5512
	5513	long * NTL_RESTRICT x = &M[i][0];
	5514	long t1 = x[k];
	5515	t1 = MulMod(t1, pivot_inv, p);
	5516	t1 = NegateMod(t1, p);
	5517	x[k] = t1;
	5518	if (t1 == 0) continue;
	5519
	5520	// add t1 * row r to row i
	5521	mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
	5522
	5523	for (long j = k+1; j < m; j++) {
	5524	long t2 = MulModPrecon(y[j], t1, p, t1pinv);
	5525	x[j] = AddMod(x[j], t2, p);
	5526	}
	5527	}
	5528	NTL_GEXEC_RANGE_END
	5529
	5530	pcol[r] = k;
	5531	r++;
	5532	}
	5533
	5534	if (im) {
	5535	mat_zz_p& Im = *im;;
	5536	if (full)
	5537	Im.SetDims(n, m);
	5538	else
	5539	Im.SetDims(r, m);
	5540
	5541	for (long i = 0; i < r; i++) {
	5542	long pc = pcol[i];
	5543	for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
	5544	for (long j = pc; j < m; j++) Im[i][j].LoopHole() = M[i][j];
	5545	}
	5546
	5547	if (full) {
	5548	for (long i = r; i < n; i++) {
	5549	for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
	5550	for (long j = w; j < m; j++) Im[i][j].LoopHole() = M[i][j];
	5551	}
	5552	}
	5553	}
	5554
	5555	if (ker) {
	5556
	5557	if (n == r) {
	5558	mat_zz_p& Ker = *ker;
	5559	Ker.SetDims(n-r, n);
	5560	}
	5561	else {
	5562	Mat<long> colbuf;
	5563	colbuf.SetDims(r, n);
	5564
	5565	for (long k = 0; k < r; k++) {
	5566	long pc = pcol[k];
	5567	for (long i = k+1; i < n; i++) colbuf[k][i] = M[i][pc];
	5568	}
	5569
	5570	M.kill();
	5571
	5572	Mat<long> X;
	5573	X.SetDims(n-r, r);
	5574
	5575	bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;
	5576	NTL_GEXEC_RANGE(seq, n-r, first, last)
	5577	NTL_IMPORT(p)
	5578	NTL_IMPORT(r)
	5579
	5580	for (long i = first; i < last; i++) {
	5581	long *Xi = &X[i][0];
	5582
	5583	for (long k = r-1; k >= 0; k--) {
	5584	long *cvecp = &colbuf[k][0];
	5585
	5586	long acc = cvecp[i+r];
	5587	for (long j = k+1; j < r; j++) {
	5588	acc = AddMod( acc, MulMod(Xi[j], cvecp[j], p), p );
	5589	}
	5590	Xi[k] = acc;
	5591	}
	5592
	5593	}
	5594
	5595	NTL_GEXEC_RANGE_END
	5596
	5597	mat_zz_p& Ker = *ker;
	5598	Ker.SetDims(n-r, n);
	5599	for (long i = 0; i < n-r; i++) {
	5600	for (long j = 0; j < r; j++) Ker[i][j].LoopHole() = X[i][j];
	5601	for (long j = r; j < n; j++) Ker[i][j].LoopHole() = 0;
	5602	Ker[i][r+i].LoopHole() = 1;
	5603	}
	5604
	5605	if (pivoting) {
	5606	for (long i = 0; i < n-r; i++) {
	5607	zz_p *x = Ker[i].elts();
	5608
	5609	for (long k = n-1; k >= 0; k--) {
	5610	long pos = P[k];
	5611	if (pos != k) swap(x[pos], x[k]);
	5612	}
	5613	}
	5614	}
	5615	}
	5616	}
	5617
	5618	return r;
	5619	}
	5620
	5621	#ifdef NTL_HAVE_LL_TYPE
	5622
	5623
	5624	#ifdef NTL_HAVE_AVX
	5625
	5626
	5627	static inline
	5628	void CopyBlock(double *dst_ptr, long dst_blk, const double *src_ptr, long src_blk, long src_limit)
	5629	{
	5630	long src_row = src_blk*MAT_BLK_SZ;
	5631	long dst_row = dst_blk*MAT_BLK_SZ;
	5632
	5633	long nrows = min(MAT_BLK_SZ, src_limit - src_row);
	5634
	5635	for (long i = 0; i < nrows; i++)
	5636	for (long j = 0; j < MAT_BLK_SZ; j++)
	5637	dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
	5638
	5639	for (long i = nrows; i < MAT_BLK_SZ; i++)
	5640	for (long j = 0; j < MAT_BLK_SZ; j++)
	5641	dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
	5642
	5643	}
	5644
	5645	static inline
	5646	void CopyBlock(double *dst_ptr, long dst_blk, const double *src_ptr, long src_blk)
	5647	{
	5648	long src_row = src_blk*MAT_BLK_SZ;
	5649	long dst_row = dst_blk*MAT_BLK_SZ;
	5650
	5651	long nrows = MAT_BLK_SZ;
	5652
	5653	for (long i = 0; i < nrows; i++)
	5654	for (long j = 0; j < MAT_BLK_SZ; j++)
	5655	dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
	5656	}
	5657
	5658	static inline
	5659	void SwapOneRow(double *panelp, long i, long pos)
	5660	{
	5661	double * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
	5662	double * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
	5663	for (long j = 0; j < MAT_BLK_SZ; j++)
	5664	_ntl_swap(pos_p[j], i_p[j]);
	5665	}
	5666
	5667	static inline
	5668	void ApplySwaps(double *panelp, long start, long end, const Vec<long>& P)
	5669	{
	5670	for (long i = start; i < end; i++) {
	5671	long pos = P[i];
	5672	if (pos != i)
	5673	SwapOneRow(panelp, i, pos);
	5674	}
	5675	}
	5676
	5677
	5678	static inline
	5679	void MulAddBlock(double *x, const double *y, const double *z)
	5680	{
	5681	// x += y*z
	5682	muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ);
	5683	}
	5684
	5685
	5686	static
	5687	long elim_blk_DD(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
	5688	long w, bool full)
	5689	{
	5690	long n = A.NumRows();
	5691	long m = A.NumCols();
	5692
	5693	if (w < 0 || w > m) LogicError("elim: bad args");
	5694
	5695	// take care of corner cases
	5696	if (n == 0) {
	5697	if (im) im->SetDims(0, m);
	5698	if (ker) ker->SetDims(0, 0);
	5699	return 0;
	5700	}
	5701
	5702	if (w == 0) {
	5703	if (im) {
	5704	if (full)
	5705	(*im) = A;
	5706	else
	5707	im->SetDims(0, m);
	5708	}
	5709	if (ker) ident(*ker, n);
	5710	return 0;
	5711	}
	5712
	5713	if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	5714	if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	5715
	5716	long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	5717
	5718
	5719	Vec< AlignedArray<double> > M;
	5720	M.SetLength(npanels);
	5721	for (long panel = 0; panel < npanels; panel++) {
	5722	M[panel].SetLength(n*MAT_BLK_SZ);
	5723	double *panelp = &M[panel][0];
	5724
	5725	for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
	5726	}
	5727
	5728	// copy A into panels
	5729	for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
	5730	long j_max = min(jj+MAT_BLK_SZ, m);
	5731	double *panelp = &M[panel][0];
	5732
	5733	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	5734	const zz_p *ap = A[i].elts() + jj;
	5735
	5736	for (long j = jj; j < j_max; j++)
	5737	panelp[j-jj] = rep(ap[j-jj]);
	5738	}
	5739	}
	5740
	5741	AlignedArray<double> aux_panel_store;
	5742	aux_panel_store.SetLength(n*MAT_BLK_SZ);
	5743	double * NTL_RESTRICT aux_panel = &aux_panel_store[0];
	5744
	5745
	5746	AlignedArray<double> buf_store1;
	5747	buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	5748	double *buf1 = &buf_store1[0];
	5749
	5750	Vec<long> P;
	5751	P.SetLength(n);
	5752	for (long k = 0; k < n; k++) P[k] = k;
	5753	// records swap operations
	5754
	5755	Vec<long> pcol;
	5756	pcol.SetLength(n);
	5757	// pcol[i] records pivot columns for row i
	5758
	5759	long p = zz_p::modulus();
	5760	mulmod_t pinv = zz_p::ModulusInverse();
	5761	sp_reduce_struct red_struct = zz_p::red_struct();
	5762
	5763	bool pivoting = false;
	5764
	5765	long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
	5766	long red_count = red_trigger;
	5767
	5768	long r = 0, rr = 0, k = 0, kk = 0;
	5769	long rpanel = 0, kpanel = 0;
	5770
	5771	while (k < w) {
	5772
	5773	if (r > rr && ker) {
	5774	// we have a panel from a previous iteration
	5775	// we store enough of it to facilitate the kernel
	5776	// computation later. At this point, we have
	5777	// r == rr+INV_BLK_SIZE, and it suffices to store
	5778	// rows [r..n) into M[rpanel], and this will not
	5779	// overwrite anything useful in M[rpanel]
	5780
	5781	double *panelp = &M[rpanel][0];
	5782	for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
	5783	panelp[h] = aux_panel[h];
	5784	}
	5785
	5786	rpanel++;
	5787	}
	5788
	5789	rr = r;
	5790
	5791	for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;
	5792
	5793	bool cleanup = false;
	5794
	5795	if (red_count-MAT_BLK_SZ < 0) {
	5796	red_count = red_trigger;
	5797	cleanup = true;
	5798	}
	5799
	5800	red_count = red_count-MAT_BLK_SZ;
	5801
	5802	for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete
	5803
	5804	if (k == kk+MAT_BLK_SZ) { // start new kpanel
	5805	kk = k;
	5806	kpanel++;
	5807	}
	5808
	5809	double * NTL_RESTRICT kpanelp = &M[kpanel][0];
	5810
	5811	if (k == kk) { // a fresh kpanel -- special processing
	5812
	5813	if (cleanup) {
	5814	for (long h = 0; h < n*MAT_BLK_SZ; h++)
	5815	kpanelp[h] = rem((unsigned long)(long)kpanelp[h], p, red_struct);
	5816	}
	5817
	5818	if (r > rr) {
	5819
	5820
	5821	// apply current sequence of permutations
	5822
	5823	ApplySwaps(kpanelp, rr, r, P);
	5824
	5825	// clean aux_panel
	5826	for (long h = 0; h < n*MAT_BLK_SZ; h++)
	5827	aux_panel[h] = rem((unsigned long)(long)aux_panel[h], p, red_struct);
	5828
	5829	// copy rows [rr..r) of kpanel into buf1
	5830	for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
	5831	buf1[i] = rem((unsigned long)(long)kpanelp[rr*MAT_BLK_SZ+i], p, red_struct);
	5832
	5833	// kpanel[rr..n) += aux_panel[rr..n)*buf1
	5834
	5835	muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr);
	5836	}
	5837	}
	5838
	5839	long pos = -1;
	5840	long pivot;
	5841	long pivot_inv;
	5842	for (long i = r; i < n; i++) {
	5843	pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
	5844	kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;
	5845
	5846	if (pivot != 0) {
	5847	pivot_inv = InvMod(pivot, p);
	5848	pos = i;
	5849	break;
	5850	}
	5851	}
	5852
	5853	if (pos == -1) {
	5854	continue;
	5855	}
	5856
	5857	double * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
	5858	double * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
	5859	if (r != pos) {
	5860	// swap rows pos and r
	5861	double * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
	5862	double * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
	5863
	5864	for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
	5865	for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
	5866
	5867	P[r] = pos;
	5868	pivoting = true;
	5869	}
	5870
	5871	// clean up row r of kpanel and aux_panel
	5872	for (long j = k-kk; j < MAT_BLK_SZ; j++)
	5873	y[j] = rem((unsigned long)(long)y[j], p, red_struct);
	5874	for (long j = 0; j < r-rr; j++)
	5875	y1[j] = rem((unsigned long)(long)y1[j], p, red_struct);
	5876
	5877	// clear column
	5878	for (long i = r+1; i < n; i++) {
	5879	double * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
	5880	double * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
	5881	long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
	5882	t1 = MulMod(t1, pivot_inv, p);
	5883	t1 = NegateMod(t1, p);
	5884	x[k-kk] = 0;
	5885	x1[r-rr] = t1;
	5886	if (t1 == 0) continue;
	5887
	5888	// add t1 * row r to row i
	5889	double ut1 = t1;
	5890
	5891	for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
	5892	x[j] += y[j]*ut1;
	5893	for (long j = 0; j < r-rr; j++)
	5894	x1[j] += y1[j]*ut1;
	5895	}
	5896
	5897	pcol[r] = k;
	5898	r++;
	5899	}
	5900
	5901	if (r > rr) {
	5902
	5903	// we have a panel
	5904
	5905	// clean it up
	5906	for (long h = 0; h < n*MAT_BLK_SZ; h++)
	5907	aux_panel[h] = rem((unsigned long)(long)aux_panel[h], p, red_struct);
	5908
	5909	bool seq =
	5910	double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;
	5911
	5912	// apply aux_panel to remaining panels: [kpanel+1..npanels)
	5913	NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
	5914	NTL_IMPORT(p)
	5915	NTL_IMPORT(n)
	5916	NTL_IMPORT(red_struct)
	5917	NTL_IMPORT(aux_panel)
	5918	NTL_IMPORT(rr)
	5919	NTL_IMPORT(r)
	5920
	5921
	5922	AlignedArray<double> buf_store;
	5923	buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	5924	double *buf = &buf_store[0];
	5925
	5926
	5927	for (long index = first; index < last; index++) {
	5928	long jpanel = index + kpanel+1;
	5929
	5930	double * NTL_RESTRICT jpanelp = &M[jpanel][0];
	5931
	5932	if (cleanup) {
	5933	for (long h = 0; h < n*MAT_BLK_SZ; h++)
	5934	jpanelp[h] = rem((unsigned long)(long)jpanelp[h], p, red_struct);
	5935	}
	5936
	5937	// perform swaps
	5938	ApplySwaps(jpanelp, rr, r, P);
	5939
	5940	// copy rows [rr..r) of jpanel into buf
	5941	for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
	5942	buf[i] = rem((unsigned long)(long)jpanelp[rr*MAT_BLK_SZ+i], p, red_struct);
	5943
	5944	// jpanel[rr..n) += aux_panel[rr..n)*buf
	5945
	5946	muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr);
	5947	}
	5948
	5949	NTL_GEXEC_RANGE_END
	5950
	5951	}
	5952
	5953	}
	5954
	5955	if (im) {
	5956	mat_zz_p& Im = *im;;
	5957	if (full)
	5958	Im.SetDims(n, m);
	5959	else
	5960	Im.SetDims(r, m);
	5961
	5962	for (long i = 0; i < r; i++) {
	5963	long pc = pcol[i];
	5964	for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
	5965	for (long j = pc; j < m; j++) {
	5966	double t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
	5967	Im[i][j].LoopHole() = rem((unsigned long)(long)t0, p, red_struct);
	5968	}
	5969	}
	5970
	5971	if (full) {
	5972	for (long i = r; i < n; i++) {
	5973	for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
	5974	for (long j = w; j < m; j++) {
	5975	double t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
	5976	Im[i][j].LoopHole() = rem((unsigned long)(long)t0, p, red_struct);
	5977	}
	5978	}
	5979	}
	5980	}
	5981
	5982	if (ker) {
	5983	mat_zz_p& Ker = *ker;
	5984	Ker.SetDims(n-r, n);
	5985	if (r < n) {
	5986
	5987	long start_block = r/MAT_BLK_SZ;
	5988	long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	5989	long vblocks = end_block-start_block;
	5990	long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	5991
	5992	Vec< AlignedArray<double> > kerbuf;
	5993	kerbuf.SetLength(vblocks);
	5994	for (long i = 0; i < vblocks; i++)
	5995	kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);
	5996
	5997	long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	5998
	5999	// if r > rr, we have a panel sitting in
	6000	// aux_panel, which may or may not be a full panel
	6001
	6002	double *initial_panel = 0;
	6003	if (r > rr) {
	6004	initial_panel = aux_panel;
	6005	}
	6006	else {
	6007	initial_panel = &M[hblocks-1][0];
	6008	}
	6009
	6010	for (long vb = start_block; vb < end_block; vb++)
	6011	CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);
	6012
	6013	for (long hb = hblocks-2; hb >= 0; hb--) {
	6014
	6015	ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);
	6016
	6017	for (long b = hb+1; b < end_block; b++)
	6018	CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
	6019	}
	6020
	6021	bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;
	6022
	6023
	6024	NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
	6025	NTL_IMPORT(p)
	6026	NTL_IMPORT(red_struct)
	6027	NTL_IMPORT(hblocks)
	6028
	6029	for (long index = first; index < last; index++) {
	6030	long vb = index + start_block;
	6031	double *kerbufp = &kerbuf[vb-start_block][0];
	6032
	6033	for (long hb = hblocks-2; hb >= 0; hb--) {
	6034	double *colbuf = &M[hb][0];
	6035	double *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];
	6036
	6037	CopyBlock(acc, 0, colbuf, vb-1);
	6038
	6039	long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
	6040	long red_count = red_trigger;
	6041
	6042	for (long b = hb+1; b < hblocks; b++) {
	6043
	6044	if (red_count-MAT_BLK_SZ < 0) {
	6045	red_count = red_trigger;
	6046	for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
	6047	acc[h] = rem((unsigned long)(long)acc[h], p, red_struct);
	6048
	6049	}
	6050	red_count = red_count-MAT_BLK_SZ;
	6051
	6052	MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
	6053	&colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ]);
	6054	}
	6055
	6056	for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
	6057	acc[h] = rem((unsigned long)(long)acc[h], p, red_struct);
	6058	}
	6059	}
	6060
	6061	NTL_GEXEC_RANGE_END
	6062
	6063	for (long i = r; i < n; i++) {
	6064
	6065	double *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];
	6066
	6067	for (long j = 0; j < r; j++) {
	6068	double t0 =
	6069	kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
	6070	(i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
	6071
	6072	Ker[i-r][j].LoopHole() = long(t0);
	6073	}
	6074	}
	6075
	6076	for (long i = 0; i < n-r; i++) {
	6077	for (long j = 0; j < n-r; j++) {
	6078	Ker[i][j+r].LoopHole() = 0;
	6079	}
	6080	Ker[i][i+r].LoopHole() = 1;
	6081	}
	6082
	6083	if (pivoting) {
	6084	for (long i = 0; i < n-r; i++) {
	6085	zz_p *x = Ker[i].elts();
	6086
	6087	for (long k = n-1; k >= 0; k--) {
	6088	long pos = P[k];
	6089	if (pos != k) swap(x[pos], x[k]);
	6090	}
	6091	}
	6092	}
	6093	}
	6094	}
	6095
	6096	return r;
	6097
	6098	}
	6099
	6100	#endif
	6101
	6102
	6103
	6104	static inline
	6105	void CopyBlock(unsigned long *dst_ptr, long dst_blk, const unsigned long *src_ptr, long src_blk, long src_limit)
	6106	{
	6107	long src_row = src_blk*MAT_BLK_SZ;
	6108	long dst_row = dst_blk*MAT_BLK_SZ;
	6109
	6110	long nrows = min(MAT_BLK_SZ, src_limit - src_row);
	6111
	6112	for (long i = 0; i < nrows; i++)
	6113	for (long j = 0; j < MAT_BLK_SZ; j++)
	6114	dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
	6115
	6116	for (long i = nrows; i < MAT_BLK_SZ; i++)
	6117	for (long j = 0; j < MAT_BLK_SZ; j++)
	6118	dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
	6119
	6120	}
	6121
	6122	static inline
	6123	void CopyBlock(unsigned long *dst_ptr, long dst_blk, const unsigned long *src_ptr, long src_blk)
	6124	{
	6125	long src_row = src_blk*MAT_BLK_SZ;
	6126	long dst_row = dst_blk*MAT_BLK_SZ;
	6127
	6128	long nrows = MAT_BLK_SZ;
	6129
	6130	for (long i = 0; i < nrows; i++)
	6131	for (long j = 0; j < MAT_BLK_SZ; j++)
	6132	dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
	6133	}
	6134
	6135	static inline
	6136	void TransposeBlock(unsigned long *dst_ptr, long dst_blk)
	6137	{
	6138	dst_ptr += dst_blk*MAT_BLK_SZ*MAT_BLK_SZ;
	6139
	6140	for (long i = 0; i < MAT_BLK_SZ; i++)
	6141	for (long j = 0; j < i; j++)
	6142	_ntl_swap(dst_ptr[i*MAT_BLK_SZ+j], dst_ptr[i+j*MAT_BLK_SZ]);
	6143	}
	6144
	6145	static inline
	6146	void SwapOneRow(unsigned long *panelp, long i, long pos)
	6147	{
	6148	unsigned long * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
	6149	unsigned long * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
	6150	for (long j = 0; j < MAT_BLK_SZ; j++)
	6151	_ntl_swap(pos_p[j], i_p[j]);
	6152	}
	6153
	6154	static inline
	6155	void ApplySwaps(unsigned long *panelp, long start, long end, const Vec<long>& P)
	6156	{
	6157	for (long i = start; i < end; i++) {
	6158	long pos = P[i];
	6159	if (pos != i)
	6160	SwapOneRow(panelp, i, pos);
	6161	}
	6162	}
	6163
	6164
	6165	static inline
	6166	void MulAddBlock(unsigned long *x, const unsigned long *y, const unsigned long *z)
	6167	{
	6168	// x += y*z
	6169
	6170	muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ);
	6171	}
	6172
	6173
	6174	static
	6175	long elim_blk_L(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
	6176	long w, bool full)
	6177	{
	6178	long n = A.NumRows();
	6179	long m = A.NumCols();
	6180
	6181	if (w < 0 || w > m) LogicError("elim: bad args");
	6182
	6183	// take care of corner cases
	6184	if (n == 0) {
	6185	if (im) im->SetDims(0, m);
	6186	if (ker) ker->SetDims(0, 0);
	6187	return 0;
	6188	}
	6189
	6190	if (w == 0) {
	6191	if (im) {
	6192	if (full)
	6193	(*im) = A;
	6194	else
	6195	im->SetDims(0, m);
	6196	}
	6197	if (ker) ident(*ker, n);
	6198	return 0;
	6199	}
	6200
	6201	if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	6202	if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	6203
	6204	long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	6205
	6206
	6207	Vec< UniqueArray<unsigned long> > M;
	6208	M.SetLength(npanels);
	6209	for (long panel = 0; panel < npanels; panel++) {
	6210	M[panel].SetLength(n*MAT_BLK_SZ);
	6211	unsigned long *panelp = &M[panel][0];
	6212
	6213	for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
	6214	}
	6215
	6216	// copy A into panels
	6217	for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
	6218	long j_max = min(jj+MAT_BLK_SZ, m);
	6219	unsigned long *panelp = &M[panel][0];
	6220
	6221	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	6222	const zz_p *ap = A[i].elts() + jj;
	6223
	6224	for (long j = jj; j < j_max; j++)
	6225	panelp[j-jj] = rep(ap[j-jj]);
	6226	}
	6227	}
	6228
	6229	UniqueArray<unsigned long> aux_panel_store;
	6230	aux_panel_store.SetLength(n*MAT_BLK_SZ);
	6231	unsigned long * NTL_RESTRICT aux_panel = &aux_panel_store[0];
	6232
	6233
	6234	UniqueArray<unsigned long> buf_store1;
	6235	buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	6236	unsigned long *buf1 = &buf_store1[0];
	6237
	6238	Vec<long> P;
	6239	P.SetLength(n);
	6240	for (long k = 0; k < n; k++) P[k] = k;
	6241	// records swap operations
	6242
	6243	Vec<long> pcol;
	6244	pcol.SetLength(n);
	6245	// pcol[i] records pivot columns for row i
	6246
	6247	long p = zz_p::modulus();
	6248	mulmod_t pinv = zz_p::ModulusInverse();
	6249	sp_reduce_struct red_struct = zz_p::red_struct();
	6250
	6251	bool pivoting = false;
	6252
	6253	unsigned long ured_trigger =
	6254	(~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
	6255	// NOTE: corner case at p == 2: need unsigned long to prevent overflow
	6256
	6257	long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
	6258
	6259	long red_count = red_trigger;
	6260
	6261	long r = 0, rr = 0, k = 0, kk = 0;
	6262	long rpanel = 0, kpanel = 0;
	6263
	6264	while (k < w) {
	6265
	6266	if (r > rr && ker) {
	6267	// we have a panel from a previous iteration
	6268	// we store enough of it to facilitate the kernel
	6269	// computation later. At this point, we have
	6270	// r == rr+INV_BLK_SIZE, and it suffices to store
	6271	// rows [r..n) into M[rpanel], and this will not
	6272	// overwrite anything useful in M[rpanel]
	6273
	6274	unsigned long *panelp = &M[rpanel][0];
	6275	for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
	6276	panelp[h] = aux_panel[h];
	6277	}
	6278
	6279	rpanel++;
	6280	}
	6281
	6282	rr = r;
	6283
	6284	for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;
	6285
	6286	bool cleanup = false;
	6287
	6288	if (red_count-MAT_BLK_SZ < 0) {
	6289	red_count = red_trigger;
	6290	cleanup = true;
	6291	}
	6292
	6293	red_count = red_count-MAT_BLK_SZ;
	6294
	6295	for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete
	6296
	6297	if (k == kk+MAT_BLK_SZ) { // start new kpanel
	6298	kk = k;
	6299	kpanel++;
	6300	}
	6301
	6302	unsigned long * NTL_RESTRICT kpanelp = &M[kpanel][0];
	6303
	6304	if (k == kk) { // a fresh kpanel -- special processing
	6305
	6306	if (cleanup) {
	6307	for (long h = 0; h < n*MAT_BLK_SZ; h++)
	6308	kpanelp[h] = rem(kpanelp[h], p, red_struct);
	6309	}
	6310
	6311	if (r > rr) {
	6312
	6313
	6314	// apply current sequence of permutations
	6315
	6316	ApplySwaps(kpanelp, rr, r, P);
	6317
	6318	// clean aux_panel
	6319	for (long h = 0; h < n*MAT_BLK_SZ; h++)
	6320	aux_panel[h] = rem(aux_panel[h], p, red_struct);
	6321
	6322	// copy rows [rr..r) of kpanel into buf1
	6323	for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
	6324	buf1[i] = rem(kpanelp[rr*MAT_BLK_SZ+i], p, red_struct);
	6325
	6326	TransposeBlock(buf1, 0);
	6327
	6328	// kpanel[rr..n) += aux_panel[rr..n)*buf1
	6329
	6330	muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr);
	6331	}
	6332	}
	6333
	6334	long pos = -1;
	6335	long pivot;
	6336	long pivot_inv;
	6337	for (long i = r; i < n; i++) {
	6338	pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
	6339	kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;
	6340
	6341	if (pivot != 0) {
	6342	pivot_inv = InvMod(pivot, p);
	6343	pos = i;
	6344	break;
	6345	}
	6346	}
	6347
	6348	if (pos == -1) {
	6349	continue;
	6350	}
	6351
	6352	unsigned long * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
	6353	unsigned long * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
	6354	if (r != pos) {
	6355	// swap rows pos and r
	6356	unsigned long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
	6357	unsigned long * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
	6358
	6359	for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
	6360	for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
	6361
	6362	P[r] = pos;
	6363	pivoting = true;
	6364	}
	6365
	6366	// clean up row r of kpanel and aux_panel
	6367	for (long j = k-kk; j < MAT_BLK_SZ; j++)
	6368	y[j] = rem(y[j], p, red_struct);
	6369	for (long j = 0; j < r-rr; j++)
	6370	y1[j] = rem(y1[j], p, red_struct);
	6371
	6372	// clear column
	6373	for (long i = r+1; i < n; i++) {
	6374	unsigned long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
	6375	unsigned long * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
	6376	long t1 = rem(x[k-kk], p, red_struct);
	6377	t1 = MulMod(t1, pivot_inv, p);
	6378	t1 = NegateMod(t1, p);
	6379	x[k-kk] = 0;
	6380	x1[r-rr] = t1;
	6381	if (t1 == 0) continue;
	6382
	6383	// add t1 * row r to row i
	6384	unsigned long ut1 = t1;
	6385
	6386	for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
	6387	x[j] += y[j]*ut1;
	6388	for (long j = 0; j < r-rr; j++)
	6389	x1[j] += y1[j]*ut1;
	6390	}
	6391
	6392	pcol[r] = k;
	6393	r++;
	6394	}
	6395
	6396	if (r > rr) {
	6397
	6398	// we have a panel
	6399
	6400	// clean it up
	6401	for (long h = 0; h < n*MAT_BLK_SZ; h++)
	6402	aux_panel[h] = rem(aux_panel[h], p, red_struct);
	6403
	6404	bool seq =
	6405	double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;
	6406
	6407	// apply aux_panel to remaining panels: [kpanel+1..npanels)
	6408	NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
	6409	NTL_IMPORT(p)
	6410	NTL_IMPORT(n)
	6411	NTL_IMPORT(red_struct)
	6412	NTL_IMPORT(aux_panel)
	6413	NTL_IMPORT(rr)
	6414	NTL_IMPORT(r)
	6415
	6416
	6417	UniqueArray<unsigned long> buf_store;
	6418	buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	6419	unsigned long *buf = &buf_store[0];
	6420
	6421
	6422	for (long index = first; index < last; index++) {
	6423	long jpanel = index + kpanel+1;
	6424
	6425	unsigned long * NTL_RESTRICT jpanelp = &M[jpanel][0];
	6426
	6427	if (cleanup) {
	6428	for (long h = 0; h < n*MAT_BLK_SZ; h++)
	6429	jpanelp[h] = rem(jpanelp[h], p, red_struct);
	6430	}
	6431
	6432	// perform swaps
	6433	ApplySwaps(jpanelp, rr, r, P);
	6434
	6435	// copy rows [rr..r) of jpanel into buf
	6436	for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
	6437	buf[i] = rem(jpanelp[rr*MAT_BLK_SZ+i], p, red_struct);
	6438
	6439	TransposeBlock(buf, 0);
	6440
	6441	// jpanel[rr..n) += aux_panel[rr..n)*buf
	6442
	6443	muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr);
	6444	}
	6445
	6446	NTL_GEXEC_RANGE_END
	6447
	6448	}
	6449
	6450	}
	6451
	6452	if (im) {
	6453	mat_zz_p& Im = *im;;
	6454	if (full)
	6455	Im.SetDims(n, m);
	6456	else
	6457	Im.SetDims(r, m);
	6458
	6459	for (long i = 0; i < r; i++) {
	6460	long pc = pcol[i];
	6461	for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
	6462	for (long j = pc; j < m; j++) {
	6463	unsigned long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
	6464	Im[i][j].LoopHole() = rem(t0, p, red_struct);
	6465	}
	6466	}
	6467
	6468	if (full) {
	6469	for (long i = r; i < n; i++) {
	6470	for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
	6471	for (long j = w; j < m; j++) {
	6472	unsigned long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
	6473	Im[i][j].LoopHole() = rem(t0, p, red_struct);
	6474	}
	6475	}
	6476	}
	6477	}
	6478
	6479	if (ker) {
	6480	mat_zz_p& Ker = *ker;
	6481	Ker.SetDims(n-r, n);
	6482	if (r < n) {
	6483
	6484	long start_block = r/MAT_BLK_SZ;
	6485	long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	6486	long vblocks = end_block-start_block;
	6487	long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	6488
	6489	Vec< UniqueArray<unsigned long> > kerbuf;
	6490	kerbuf.SetLength(vblocks);
	6491	for (long i = 0; i < vblocks; i++)
	6492	kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);
	6493
	6494	long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	6495
	6496	// if r > rr, we have a panel sitting in
	6497	// aux_panel, which may or may not be a full panel
	6498
	6499	unsigned long *initial_panel = 0;
	6500	if (r > rr) {
	6501	initial_panel = aux_panel;
	6502	}
	6503	else {
	6504	initial_panel = &M[hblocks-1][0];
	6505	}
	6506
	6507	for (long vb = start_block; vb < end_block; vb++)
	6508	CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);
	6509
	6510	for (long hb = hblocks-2; hb >= 0; hb--) {
	6511
	6512	ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);
	6513
	6514	for (long b = hb+1; b < end_block; b++) {
	6515	CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
	6516	TransposeBlock(&M[hb][0], b-1);
	6517	}
	6518	}
	6519
	6520	bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;
	6521
	6522
	6523	NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
	6524	NTL_IMPORT(p)
	6525	NTL_IMPORT(red_struct)
	6526	NTL_IMPORT(hblocks)
	6527
	6528	for (long index = first; index < last; index++) {
	6529	long vb = index + start_block;
	6530	unsigned long *kerbufp = &kerbuf[vb-start_block][0];
	6531
	6532	for (long hb = hblocks-2; hb >= 0; hb--) {
	6533	unsigned long *colbuf = &M[hb][0];
	6534	unsigned long *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];
	6535
	6536	CopyBlock(acc, 0, colbuf, vb-1);
	6537	TransposeBlock(acc, 0);
	6538
	6539
	6540	unsigned long ured_trigger =
	6541	(~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
	6542	// NOTE: corner case at p == 2: need unsigned long to prevent overflow
	6543
	6544	long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
	6545	long red_count = red_trigger;
	6546
	6547	for (long b = hb+1; b < hblocks; b++) {
	6548
	6549	if (red_count-MAT_BLK_SZ < 0) {
	6550	red_count = red_trigger;
	6551	for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
	6552	acc[h] = rem(acc[h], p, red_struct);
	6553
	6554	}
	6555	red_count = red_count-MAT_BLK_SZ;
	6556
	6557	MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
	6558	&colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ]);
	6559	}
	6560
	6561	for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
	6562	acc[h] = rem(acc[h], p, red_struct);
	6563	}
	6564	}
	6565
	6566	NTL_GEXEC_RANGE_END
	6567
	6568	for (long i = r; i < n; i++) {
	6569
	6570	unsigned long *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];
	6571
	6572	for (long j = 0; j < r; j++) {
	6573	unsigned long t0 =
	6574	kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
	6575	(i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
	6576
	6577	Ker[i-r][j].LoopHole() = long(t0);
	6578	}
	6579	}
	6580
	6581	for (long i = 0; i < n-r; i++) {
	6582	for (long j = 0; j < n-r; j++) {
	6583	Ker[i][j+r].LoopHole() = 0;
	6584	}
	6585	Ker[i][i+r].LoopHole() = 1;
	6586	}
	6587
	6588	if (pivoting) {
	6589	for (long i = 0; i < n-r; i++) {
	6590	zz_p *x = Ker[i].elts();
	6591
	6592	for (long k = n-1; k >= 0; k--) {
	6593	long pos = P[k];
	6594	if (pos != k) swap(x[pos], x[k]);
	6595	}
	6596	}
	6597	}
	6598	}
	6599	}
	6600
	6601	return r;
	6602
	6603	}
	6604
	6605
	6606	static inline
	6607	void CopyBlock(long *dst_ptr, long dst_blk, const long *src_ptr, long src_blk, long src_limit)
	6608	{
	6609	long src_row = src_blk*MAT_BLK_SZ;
	6610	long dst_row = dst_blk*MAT_BLK_SZ;
	6611
	6612	long nrows = min(MAT_BLK_SZ, src_limit - src_row);
	6613
	6614	for (long i = 0; i < nrows; i++)
	6615	for (long j = 0; j < MAT_BLK_SZ; j++)
	6616	dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
	6617
	6618	for (long i = nrows; i < MAT_BLK_SZ; i++)
	6619	for (long j = 0; j < MAT_BLK_SZ; j++)
	6620	dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
	6621
	6622	}
	6623
	6624	static inline
	6625	void CopyBlock(long *dst_ptr, long dst_blk, const long *src_ptr, long src_blk)
	6626	{
	6627	long src_row = src_blk*MAT_BLK_SZ;
	6628	long dst_row = dst_blk*MAT_BLK_SZ;
	6629
	6630	long nrows = MAT_BLK_SZ;
	6631
	6632	for (long i = 0; i < nrows; i++)
	6633	for (long j = 0; j < MAT_BLK_SZ; j++)
	6634	dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
	6635	}
	6636
	6637	static inline
	6638	void TransposeBlock(long *dst_ptr, long dst_blk)
	6639	{
	6640	dst_ptr += dst_blk*MAT_BLK_SZ*MAT_BLK_SZ;
	6641
	6642	for (long i = 0; i < MAT_BLK_SZ; i++)
	6643	for (long j = 0; j < i; j++)
	6644	_ntl_swap(dst_ptr[i*MAT_BLK_SZ+j], dst_ptr[i+j*MAT_BLK_SZ]);
	6645	}
	6646
	6647	static inline
	6648	void SwapOneRow(long *panelp, long i, long pos)
	6649	{
	6650	long * NTL_RESTRICT pos_p = &panelp[pos*MAT_BLK_SZ];
	6651	long * NTL_RESTRICT i_p = &panelp[i*MAT_BLK_SZ];
	6652	for (long j = 0; j < MAT_BLK_SZ; j++)
	6653	_ntl_swap(pos_p[j], i_p[j]);
	6654	}
	6655
	6656	static inline
	6657	void ApplySwaps(long *panelp, long start, long end, const Vec<long>& P)
	6658	{
	6659	for (long i = start; i < end; i++) {
	6660	long pos = P[i];
	6661	if (pos != i)
	6662	SwapOneRow(panelp, i, pos);
	6663	}
	6664	}
	6665
	6666
	6667	static inline
	6668	void MulAddBlock(long *x, const long *y, const long *z,
	6669	long p, sp_ll_reduce_struct ll_red_struct)
	6670	{
	6671	// x += y*z
	6672
	6673	muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ, p, ll_red_struct);
	6674	}
	6675
	6676
	6677
	6678	static
	6679	long elim_blk_LL(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
	6680	long w, bool full)
	6681	{
	6682	long n = A.NumRows();
	6683	long m = A.NumCols();
	6684
	6685	if (w < 0 || w > m) LogicError("elim: bad args");
	6686
	6687	// take care of corner cases
	6688	if (n == 0) {
	6689	if (im) im->SetDims(0, m);
	6690	if (ker) ker->SetDims(0, 0);
	6691	return 0;
	6692	}
	6693
	6694	if (w == 0) {
	6695	if (im) {
	6696	if (full)
	6697	(*im) = A;
	6698	else
	6699	im->SetDims(0, m);
	6700	}
	6701	if (ker) ident(*ker, n);
	6702	return 0;
	6703	}
	6704
	6705	if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	6706	if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
	6707
	6708	long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	6709
	6710
	6711	Vec< UniqueArray<long> > M;
	6712	M.SetLength(npanels);
	6713	for (long panel = 0; panel < npanels; panel++) {
	6714	M[panel].SetLength(n*MAT_BLK_SZ);
	6715	long *panelp = &M[panel][0];
	6716
	6717	for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
	6718	}
	6719
	6720	// copy A into panels
	6721	for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
	6722	long j_max = min(jj+MAT_BLK_SZ, m);
	6723	long *panelp = &M[panel][0];
	6724
	6725	for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
	6726	const zz_p *ap = A[i].elts() + jj;
	6727
	6728	for (long j = jj; j < j_max; j++)
	6729	panelp[j-jj] = rep(ap[j-jj]);
	6730	}
	6731	}
	6732
	6733	UniqueArray<long> aux_panel_store;
	6734	aux_panel_store.SetLength(n*MAT_BLK_SZ);
	6735	long * NTL_RESTRICT aux_panel = &aux_panel_store[0];
	6736
	6737
	6738	UniqueArray<long> buf_store1;
	6739	buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	6740	long *buf1 = &buf_store1[0];
	6741
	6742	Vec<long> P;
	6743	P.SetLength(n);
	6744	for (long k = 0; k < n; k++) P[k] = k;
	6745	// records swap operations
	6746
	6747	Vec<long> pcol;
	6748	pcol.SetLength(n);
	6749	// pcol[i] records pivot columns for row i
	6750
	6751	long p = zz_p::modulus();
	6752	mulmod_t pinv = zz_p::ModulusInverse();
	6753	sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
	6754
	6755	bool pivoting = false;
	6756
	6757	long r = 0, rr = 0, k = 0, kk = 0;
	6758	long rpanel = 0, kpanel = 0;
	6759
	6760	while (k < w) {
	6761
	6762	if (r > rr && ker) {
	6763	// we have a panel from a previous iteration
	6764	// we store enough of it to facilitate the kernel
	6765	// computation later. At this point, we have
	6766	// r == rr+INV_BLK_SIZE, and it suffices to store
	6767	// rows [r..n) into M[rpanel], and this will not
	6768	// overwrite anything useful in M[rpanel]
	6769
	6770	long *panelp = &M[rpanel][0];
	6771	for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
	6772	panelp[h] = aux_panel[h];
	6773	}
	6774
	6775	rpanel++;
	6776	}
	6777
	6778	rr = r;
	6779
	6780	for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;
	6781
	6782	for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete
	6783
	6784	if (k == kk+MAT_BLK_SZ) { // start new kpanel
	6785	kk = k;
	6786	kpanel++;
	6787	}
	6788
	6789	long * NTL_RESTRICT kpanelp = &M[kpanel][0];
	6790
	6791	if (k == kk) { // a fresh kpanel -- special processing
	6792
	6793
	6794	if (r > rr) {
	6795
	6796
	6797	// apply current sequence of permutations
	6798
	6799	ApplySwaps(kpanelp, rr, r, P);
	6800
	6801	// copy rows [rr..r) of kpanel into buf1
	6802	for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
	6803	buf1[i] = kpanelp[rr*MAT_BLK_SZ+i];
	6804
	6805	TransposeBlock(buf1, 0);
	6806
	6807	// kpanel[rr..n) += aux_panel[rr..n)*buf1
	6808
	6809	muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr, p, ll_red_struct);
	6810	}
	6811	}
	6812
	6813	long pos = -1;
	6814	long pivot;
	6815	long pivot_inv;
	6816	for (long i = r; i < n; i++) {
	6817	pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
	6818	kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;
	6819
	6820	if (pivot != 0) {
	6821	pivot_inv = InvMod(pivot, p);
	6822	pos = i;
	6823	break;
	6824	}
	6825	}
	6826
	6827	if (pos == -1) {
	6828	continue;
	6829	}
	6830
	6831	long * NTL_RESTRICT y = &kpanelp[r*MAT_BLK_SZ];
	6832	long * NTL_RESTRICT y1 = &aux_panel[r*MAT_BLK_SZ];
	6833	if (r != pos) {
	6834	// swap rows pos and r
	6835	long * NTL_RESTRICT x = &kpanelp[pos*MAT_BLK_SZ];
	6836	long * NTL_RESTRICT x1 = &aux_panel[pos*MAT_BLK_SZ];
	6837
	6838	for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
	6839	for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);
	6840
	6841	P[r] = pos;
	6842	pivoting = true;
	6843	}
	6844
	6845	// clear column
	6846	for (long i = r+1; i < n; i++) {
	6847	long * NTL_RESTRICT x = &kpanelp[i*MAT_BLK_SZ];
	6848	long * NTL_RESTRICT x1 = &aux_panel[i*MAT_BLK_SZ];
	6849	long t1 = x[k-kk];
	6850	t1 = MulMod(t1, pivot_inv, p);
	6851	t1 = NegateMod(t1, p);
	6852	x[k-kk] = 0;
	6853	x1[r-rr] = t1;
	6854	if (t1 == 0) continue;
	6855
	6856	// add t1 * row r to row i
	6857	long ut1 = t1;
	6858	mulmod_precon_t ut1_pinv = PrepMulModPrecon(ut1, p, pinv);
	6859
	6860	for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
	6861	x[j] = AddMod(x[j], MulModPrecon(y[j], ut1, p, ut1_pinv), p);
	6862	for (long j = 0; j < r-rr; j++)
	6863	x1[j] = AddMod(x1[j], MulModPrecon(y1[j], ut1, p, ut1_pinv), p);
	6864	}
	6865
	6866	pcol[r] = k;
	6867	r++;
	6868	}
	6869
	6870	if (r > rr) {
	6871
	6872	// we have a panel
	6873
	6874	bool seq =
	6875	double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;
	6876
	6877	// apply aux_panel to remaining panels: [kpanel+1..npanels)
	6878	NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
	6879	NTL_IMPORT(p)
	6880	NTL_IMPORT(n)
	6881	NTL_IMPORT(ll_red_struct)
	6882	NTL_IMPORT(aux_panel)
	6883	NTL_IMPORT(rr)
	6884	NTL_IMPORT(r)
	6885
	6886
	6887	UniqueArray<long> buf_store;
	6888	buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
	6889	long *buf = &buf_store[0];
	6890
	6891
	6892	for (long index = first; index < last; index++) {
	6893	long jpanel = index + kpanel+1;
	6894
	6895	long * NTL_RESTRICT jpanelp = &M[jpanel][0];
	6896
	6897	// perform swaps
	6898	ApplySwaps(jpanelp, rr, r, P);
	6899
	6900	// copy rows [rr..r) of jpanel into buf
	6901	for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
	6902	buf[i] = jpanelp[rr*MAT_BLK_SZ+i];
	6903
	6904	TransposeBlock(buf, 0);
	6905
	6906	// jpanel[rr..n) += aux_panel[rr..n)*buf
	6907
	6908	muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr, p, ll_red_struct);
	6909	}
	6910
	6911	NTL_GEXEC_RANGE_END
	6912
	6913	}
	6914
	6915	}
	6916
	6917	if (im) {
	6918	mat_zz_p& Im = *im;;
	6919	if (full)
	6920	Im.SetDims(n, m);
	6921	else
	6922	Im.SetDims(r, m);
	6923
	6924	for (long i = 0; i < r; i++) {
	6925	long pc = pcol[i];
	6926	for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
	6927	for (long j = pc; j < m; j++) {
	6928	long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
	6929	Im[i][j].LoopHole() = t0;
	6930	}
	6931	}
	6932
	6933	if (full) {
	6934	for (long i = r; i < n; i++) {
	6935	for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
	6936	for (long j = w; j < m; j++) {
	6937	long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
	6938	Im[i][j].LoopHole() = t0;
	6939	}
	6940	}
	6941	}
	6942	}
	6943
	6944	if (ker) {
	6945	mat_zz_p& Ker = *ker;
	6946	Ker.SetDims(n-r, n);
	6947	if (r < n) {
	6948
	6949	long start_block = r/MAT_BLK_SZ;
	6950	long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	6951	long vblocks = end_block-start_block;
	6952	long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	6953
	6954	Vec< UniqueArray<long> > kerbuf;
	6955	kerbuf.SetLength(vblocks);
	6956	for (long i = 0; i < vblocks; i++)
	6957	kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);
	6958
	6959	long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
	6960
	6961	// if r > rr, we have a panel sitting in
	6962	// aux_panel, which may or may not be a full panel
	6963
	6964	long *initial_panel = 0;
	6965	if (r > rr) {
	6966	initial_panel = aux_panel;
	6967	}
	6968	else {
	6969	initial_panel = &M[hblocks-1][0];
	6970	}
	6971
	6972	for (long vb = start_block; vb < end_block; vb++)
	6973	CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);
	6974
	6975	for (long hb = hblocks-2; hb >= 0; hb--) {
	6976
	6977	ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);
	6978
	6979	for (long b = hb+1; b < end_block; b++) {
	6980	CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
	6981	TransposeBlock(&M[hb][0], b-1);
	6982	}
	6983	}
	6984
	6985	bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;
	6986
	6987
	6988	NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
	6989	NTL_IMPORT(p)
	6990	NTL_IMPORT(ll_red_struct)
	6991	NTL_IMPORT(hblocks)
	6992
	6993	for (long index = first; index < last; index++) {
	6994	long vb = index + start_block;
	6995	long *kerbufp = &kerbuf[vb-start_block][0];
	6996
	6997	for (long hb = hblocks-2; hb >= 0; hb--) {
	6998	long *colbuf = &M[hb][0];
	6999	long *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];
	7000
	7001	CopyBlock(acc, 0, colbuf, vb-1);
	7002	TransposeBlock(acc, 0);
	7003
	7004	for (long b = hb+1; b < hblocks; b++) {
	7005	MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
	7006	&colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ], p, ll_red_struct);
	7007	}
	7008	}
	7009	}
	7010
	7011	NTL_GEXEC_RANGE_END
	7012
	7013	for (long i = r; i < n; i++) {
	7014
	7015	long *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];
	7016
	7017	for (long j = 0; j < r; j++) {
	7018	long t0 =
	7019	kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
	7020	(i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
	7021
	7022	Ker[i-r][j].LoopHole() = long(t0);
	7023	}
	7024	}
	7025
	7026	for (long i = 0; i < n-r; i++) {
	7027	for (long j = 0; j < n-r; j++) {
	7028	Ker[i][j+r].LoopHole() = 0;
	7029	}
	7030	Ker[i][i+r].LoopHole() = 1;
	7031	}
	7032
	7033	if (pivoting) {
	7034	for (long i = 0; i < n-r; i++) {
	7035	zz_p *x = Ker[i].elts();
	7036
	7037	for (long k = n-1; k >= 0; k--) {
	7038	long pos = P[k];
	7039	if (pos != k) swap(x[pos], x[k]);
	7040	}
	7041	}
	7042	}
	7043	}
	7044	}
	7045
	7046	return r;
	7047
	7048	}
	7049
	7050
	7051	#endif
	7052
	7053
	7054
	7055	static
	7056	long elim(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker, long w, bool full)
	7057	{
	7058	long n = A.NumRows();
	7059	long m = A.NumCols();
	7060
	7061	if (w < 0 || w > m) LogicError("elim: bad args");
	7062
	7063	#ifndef NTL_HAVE_LL_TYPE
	7064
	7065	return elim_basic(A, im, ker, w, full);
	7066
	7067	#else
	7068
	7069	long p = zz_p::modulus();
	7070
	7071	if (n/MAT_BLK_SZ < 4 || w/MAT_BLK_SZ < 4) {
	7072	return elim_basic(A, im, ker, w, full);
	7073	}
	7074	else {
	7075	long V = 4*MAT_BLK_SZ;
	7076
	7077	#ifdef NTL_HAVE_AVX
	7078	if (p-1 <= MAX_DBL_INT &&
	7079	V <= (MAX_DBL_INT-(p-1))/(p-1) &&
	7080	V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
	7081
	7082	return elim_blk_DD(A, im, ker, w, full);
	7083	}
	7084	else
	7085	#endif
	7086	if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
	7087	cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
	7088
	7089	return elim_blk_L(A, im, ker, w, full);
	7090
	7091	}
	7092	else {
	7093
	7094	return elim_blk_LL(A, im, ker, w, full);
	7095	}
	7096
	7097	}
	7098
	7099	#endif
	7100
	7101
	7102
	7103	}
	7104
	7105
	7106	// ******************************************************************
	7107	//
	7108	// High level interfaces
	7109	//
	7110	// ******************************************************************
	7111
	7112
	7113
	7114	long gauss(mat_zz_p& M, long w)
	7115	{
	7116	return elim(M, &M, 0, w, true);
	7117	}
	7118
	7119
	7120	long gauss(mat_zz_p& M)
	7121	{
	7122	return gauss(M, M.NumCols());
	7123	}
	7124
	7125	void image(mat_zz_p& X, const mat_zz_p& A)
	7126	{
	7127	elim(A, &X, 0, A.NumCols(), false);
	7128	}
	7129
	7130	void kernel(mat_zz_p& X, const mat_zz_p& A)
	7131	{
	7132	elim(A, 0, &X, A.NumCols(), false);
	7133	}
	7134
	7135
	7136	// ******************************************************************
	7137	//
	7138	// Operator/functional notation
	7139	//
	7140	// ******************************************************************
	7141
	7142
	7143
	7144
	7145	mat_zz_p operator+(const mat_zz_p& a, const mat_zz_p& b)
	7146	{
	7147	mat_zz_p res;
	7148	add(res, a, b);
	7149	NTL_OPT_RETURN(mat_zz_p, res);
	7150	}
	7151
	7152	mat_zz_p operator*(const mat_zz_p& a, const mat_zz_p& b)
	7153	{
	7154	mat_zz_p res;
	7155	mul_aux(res, a, b);
	7156	NTL_OPT_RETURN(mat_zz_p, res);
	7157	}
	7158
	7159	mat_zz_p operator-(const mat_zz_p& a, const mat_zz_p& b)
	7160	{
	7161	mat_zz_p res;
	7162	sub(res, a, b);
	7163	NTL_OPT_RETURN(mat_zz_p, res);
	7164	}
	7165
	7166
	7167	mat_zz_p operator-(const mat_zz_p& a)
	7168	{
	7169	mat_zz_p res;
	7170	negate(res, a);
	7171	NTL_OPT_RETURN(mat_zz_p, res);
	7172	}
	7173
	7174
	7175	vec_zz_p operator*(const mat_zz_p& a, const vec_zz_p& b)
	7176	{
	7177	vec_zz_p res;
	7178	mul_aux(res, a, b);
	7179	NTL_OPT_RETURN(vec_zz_p, res);
	7180	}
	7181
	7182	vec_zz_p operator*(const vec_zz_p& a, const mat_zz_p& b)
	7183	{
	7184	vec_zz_p res;
	7185	mul(res, a, b);
	7186	NTL_OPT_RETURN(vec_zz_p, res);
	7187	}
	7188
	7189
948	7190	NTL_END_IMPL

+23

-5

src/mat_lzz_pE.c less more

316	316	}
317	317
318	318
319		void solve(zz_pE& d, vec_zz_pE& X,
320		const mat_zz_pE& A, const vec_zz_pE& b)
	319	static
	320	void solve_impl(zz_pE& d, vec_zz_pE& X, const mat_zz_pE& A, const vec_zz_pE& b, bool trans)
321	321
322	322	{
323	323	long n = A.NumRows();

346	346
347	347	for (i = 0; i < n; i++) {
348	348	M[i].SetLength(n+1);
349		for (j = 0; j < n; j++) {
350		M[i][j].rep.SetMaxLength(2*deg(p)-1);
351		M[i][j] = rep(A[j][i]);
	349	if (trans) {
	350	for (j = 0; j < n; j++) {
	351	M[i][j].rep.SetMaxLength(2*deg(p)-1);
	352	M[i][j] = rep(A[j][i]);
	353	}
	354	}
	355	else {
	356	for (j = 0; j < n; j++) {
	357	M[i][j].rep.SetMaxLength(2*deg(p)-1);
	358	M[i][j] = rep(A[i][j]);
	359	}
352	360	}
353	361	M[i][n].rep.SetMaxLength(2*deg(p)-1);
354	362	M[i][n] = rep(b[i]);

418	426	}
419	427
420	428	conv(d, det);
	429	}
	430
	431	void solve(zz_pE& d, vec_zz_pE& x, const mat_zz_pE& A, const vec_zz_pE& b)
	432	{
	433	solve_impl(d, x, A, b, true);
	434	}
	435
	436	void solve(zz_pE& d, const mat_zz_pE& A, vec_zz_pE& x, const vec_zz_pE& b)
	437	{
	438	solve_impl(d, x, A, b, false);
421	439	}
422	440
423	441	void inv(zz_pE& d, mat_zz_pE& X, const mat_zz_pE& A)

+269

-0

src/mat_lzz_pTest.c less more

	0
	1	#include <NTL/mat_lzz_p.h>
	2
	3	NTL_CLIENT
	4
	5
	6
	7	void FillRandom(Mat<zz_p>& A)
	8	{
	9	long n = A.NumRows();
	10	long m = A.NumCols();
	11	for (long i = 0; i < n; i++)
	12	for (long j = 0; j < m; j++)
	13	random(A[i][j]);
	14	}
	15
	16	void FillRandom1(Mat<zz_p>& A)
	17	{
	18	long n = A.NumRows();
	19	long m = A.NumCols();
	20	for (long j = 0; j < m; j++) {
	21	if (j > 0 && RandomBnd(2)) {
	22	for (long i = 0; i < n; i++)
	23	A[i][j] = A[i][j-1];
	24	}
	25	else {
	26	for (long i = 0; i < n; i++)
	27	random(A[i][j]);
	28	}
	29	}
	30	}
	31
	32	void FillRandom(Vec<zz_p>& A)
	33	{
	34	long n = A.length();
	35	for (long i = 0; i < n; i++)
	36	random(A[i]);
	37	}
	38
	39	long old_gauss(mat_zz_p& M, long w)
	40	{
	41	using NTL_NAMESPACE::negate;
	42	long k, l;
	43	long i, j;
	44	long pos;
	45	zz_p t1, t2, t3;
	46	zz_p *x, *y;
	47
	48	long n = M.NumRows();
	49	long m = M.NumCols();
	50
	51	if (w < 0 || w > m)
	52	LogicError("gauss: bad args");
	53
	54	long p = zz_p::modulus();
	55	mulmod_t pinv = zz_p::ModulusInverse();
	56	long T1, T2;
	57
	58	l = 0;
	59	for (k = 0; k < w && l < n; k++) {
	60
	61	pos = -1;
	62	for (i = l; i < n; i++) {
	63	if (!IsZero(M[i][k])) {
	64	pos = i;
	65	break;
	66	}
	67	}
	68
	69	if (pos != -1) {
	70	swap(M[pos], M[l]);
	71
	72	inv(t3, M[l][k]);
	73	negate(t3, t3);
	74
	75	for (i = l+1; i < n; i++) {
	76	// M[i] = M[i] + M[l]*M[i,k]*t3
	77
	78	mul(t1, M[i][k], t3);
	79
	80	T1 = rep(t1);
	81	mulmod_precon_t T1pinv = PrepMulModPrecon(T1, p, pinv);
	82
	83	clear(M[i][k]);
	84
	85	x = M[i].elts() + (k+1);
	86	y = M[l].elts() + (k+1);
	87
	88	for (j = k+1; j < m; j++, x++, y++) {
	89	// *x = *x + (*y)*t1
	90
	91	T2 = MulModPrecon(rep(*y), T1, p, T1pinv);
	92	T2 = AddMod(T2, rep(*x), p);
	93	(*x).LoopHole() = T2;
	94	}
	95	}
	96
	97	l++;
	98	}
	99	}
	100
	101	return l;
	102	}
	103
	104	long old_gauss(mat_zz_p& M)
	105	{
	106	return old_gauss(M, M.NumCols());
	107	}
	108
	109	void old_image(mat_zz_p& X, const mat_zz_p& A)
	110	{
	111	mat_zz_p M;
	112	M = A;
	113	long r = old_gauss(M);
	114	M.SetDims(r, M.NumCols());
	115	X = M;
	116	}
	117
	118	int main(int argc, char **argv)
	119	{
	120	long iters = 100;
	121
	122
	123	#if 1
	124	cerr << "testing multiplication";
	125	for (long cnt = 0; cnt < iters; cnt++) {
	126	cerr << ".";
	127
	128	long bnd = (cnt%2) ? 25 : 2000;
	129
	130	long len = RandomBnd(NTL_SP_NBITS-3)+4;
	131	long n = RandomBnd(bnd);
	132	long l = RandomBnd(bnd);
	133	long m = RandomBnd(bnd);
	134
	135	long p = RandomPrime_long(len);
	136	zz_p::init(p);
	137
	138	Mat<zz_p> A, B, X;
	139
	140	A.SetDims(n, l);
	141	B.SetDims(l, m);
	142
	143	FillRandom(A);
	144	FillRandom(B);
	145
	146	X.SetDims(n, m);
	147
	148	vec_zz_p R;
	149
	150	R.SetLength(m);
	151	for (long i = 0; i < m; i++) random(R[i]);
	152
	153	mul(X, A, B);
	154
	155	if (X*R != A*(B*R))
	156	cerr << "\n\n\n\n\n********** oops " << len << " " << n << " " << l << " "
	157	<< m << "\n";
	158	}
	159	#endif
	160
	161	#if 1
	162	cerr << "\ntesting inversion";
	163	for (long cnt = 0; cnt < iters; cnt++) {
	164	cerr << ".";
	165	long bnd = (cnt%2) ? 25 : 1500;
	166
	167	long len = RandomBnd(NTL_SP_NBITS-3)+4;
	168	long n = RandomBnd(bnd);
	169
	170	long p = RandomPrime_long(len);
	171	zz_p::init(p);
	172
	173	Mat<zz_p> A, X;
	174
	175	A.SetDims(n, n);
	176
	177	FillRandom(A);
	178
	179
	180	vec_zz_p R;
	181
	182	R.SetLength(n);
	183	for (long i = 0; i < n; i++) random(R[i]);
	184
	185	zz_p d;
	186
	187	inv(d, X, A);
	188
	189	if (d != 0) {
	190	if (R != A*(X*R))
	191	cerr << "\n\n\n\n\n*********** oops " << len << " " << n << "\n";
	192	}
	193	else {
	194	cerr << "[singular]";
	195	}
	196	}
	197	#endif
	198
	199	#if 1
	200	cerr << "\ntesting solve";
	201	for (long cnt = 0; cnt < iters; cnt++) {
	202	cerr << ".";
	203	long bnd = (cnt%2) ? 25 : 2000;
	204
	205	long len = RandomBnd(NTL_SP_NBITS-3)+4;
	206	long n = RandomBnd(bnd);
	207
	208	long p = RandomPrime_long(len);
	209	zz_p::init(p);
	210
	211	Mat<zz_p> A;
	212
	213	A.SetDims(n, n);
	214	FillRandom(A);
	215
	216	Vec<zz_p> x, b;
	217	b.SetLength(n);
	218	FillRandom(b);
	219
	220	zz_p d;
	221
	222	solve(d, A, x, b);
	223
	224	if (d != 0) {
	225	if (A*x != b)
	226	cerr << "\n\n\n\n\n*********** oops " << len << " " << n << "\n";
	227	}
	228	else {
	229	cerr << "[singular]";
	230	}
	231	}
	232	#endif
	233
	234	#if 1
	235	cerr << "\ntesting image and kernel";
	236	for (long cnt = 0; cnt < iters; cnt++) {
	237	cerr << ".";
	238	long bnd = (cnt%2) ? 25 : 1500;
	239
	240	long len = RandomBnd(NTL_SP_NBITS-3)+4;
	241	long n = RandomBnd(bnd);
	242	long m = RandomBnd(bnd);
	243
	244	long p = RandomPrime_long(len);
	245	zz_p::init(p);
	246
	247	Mat<zz_p> A;
	248
	249	A.SetDims(n, m);
	250	FillRandom1(A);
	251
	252	Mat<zz_p> im, im1, ker1;
	253
	254	old_image(im, A);
	255	image(im1, A);
	256	kernel(ker1, A);
	257
	258
	259	if (im != im1 || !IsZero(ker1*A) || im1.NumRows() + ker1.NumRows() != n) {
	260	cerr << "\n\n\n\n\n*********** oops " << len << " " << n << m << "\n";
	261	}
	262	}
	263	#endif
	264
	265	cerr << "\n";
	266
	267	}
	268

+45

-31

src/mfile less more

12	12
13	13	CXXFLAGS=@{CXXFLAGS}
14	14	# Flags for the C++ compiler
	15
	16	CXXAUTOFLAGS=@{CXXAUTOFLAGS}
	17	# Flags for the C++ compiler, automatically generated by configuration script
15	18
16	19
17	20	AR=@{AR}

136	139	O16=$(O15)
137	140	O17=$(O16)
138	141	O18=$(O17) xdouble.o
139		O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o
	142	O19=$(O18) G_LLL_FP.o G_LLL_QP.o G_LLL_XD.o G_LLL_RR.o thread.o BasicThreadPool.o
140	143
141	144	OBJ=$(O19)
142	145

161	164	S16=$(S15)
162	165	S17=$(S16)
163	166	S18=$(S17) xdouble.c
164		S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c
	167	S19=$(S18) G_LLL_FP.c G_LLL_QP.c G_LLL_XD.c G_LLL_RR.c thread.c BasicThreadPool.c
165	168
166	169	SRC = $(S19)
167	170

193	196	IN16=$(IN15) vec_vec_ZZ_p.h vec_vec_ZZ_pE.h vec_vec_long.h vec_vec_lzz_p.h
194	197	IN17=$(IN16) vec_vec_lzz_pE.h vec_xdouble.h xdouble.h config.h version.h
195	198	IN18=$(IN17) def_config.h new.h vec_ulong.h vec_vec_ulong.h c_lip.h g_lip.h
196		IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h
197		IN20=$(IN19) have_LL_no.h have_LL_yes.h have_builtin_clzl_no.h have_builtin_clzl_yes.h
198
199		INCL=$(IN20)
	199	IN19=$(IN18) SmartPtr.h Lazy.h LazyTable.h thread.h BasicThreadPool.h
	200	INCL=$(IN19)
200	201
201	202
202	203

212	213	# test source files
213	214
214	215	TS1=QuickTest.c BerlekampTest.c CanZassTest.c ZZXFacTest.c MoreFacTest.c LLLTest.c
215		TS2=$(TS1) subset.c MatrixTest.c CharPolyTest.c RRTest.c QuadTest.c
	216	TS2=$(TS1) subset.c MatrixTest.c mat_lzz_pTest.c CharPolyTest.c RRTest.c QuadTest.c
216	217	TS3=$(TS2) GF2XTest.c GF2EXTest.c BitMatTest.c ZZ_pEXTest.c lzz_pEXTest.c Timing.c
217	218	TS4=$(TS3) ThreadTest.c ExceptionTest.c
218	219	TS = $(TS4)
219	220
220	221	# scripts
221	222
222		SCRIPTS1=MakeGetTime MakeGetPID MakeCheckCLZL MakeCheckLL TestScript dosify unixify RemoveProg
	223	SCRIPTS1=MakeGetTime MakeGetPID MakeCheckFeature ResetFeatures CopyFeatures TestScript dosify unixify RemoveProg
223	224	SCRIPTS2=$(SCRIPTS1) configure DoConfig mfile cfile ppscript
224	225
225	226	SCRIPTS=$(SCRIPTS2)
226	227
227	228	# auxilliary source
228	229
229		MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c CheckPCLMUL.c
230		GT=GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
	230	MD=MakeDesc.c MakeDescAux.c newnames.c gen_gmp_aux.c
	231	GT=GetTime0.c GetTime1.c GetTime2.c GetTime3.c GetTime4.c GetTime5.c TestGetTime.c
231	232	GP=GetPID1.c GetPID2.c TestGetPID.c
232		CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c
	233	CH=CheckCLZL.c CheckCLZLAux.c CheckLL.c CheckLLAux.c CheckAVX.c CheckFMA.c CheckCompile.c
	234
	235	AUXPROGS = TestGetTime TestGetPID CheckFeature CheckCompile
233	236
234	237
235	238
236	239	# documentation
237	240
238	241
239		D01=copying.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
	242	D01=copying.txt BasicThreadPool.txt GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt
240	243	D02=$(D01) GF2XFactoring.txt GF2XVec.txt HNF.txt Lazy.txt LazyTable.txt LLL.txt RR.txt SmartPtr.txt
241	244	D03=$(D02) ZZ.txt ZZVec.txt ZZX.txt ZZXFactoring.txt ZZ_p.txt ZZ_pE.txt
242	245	D04=$(D03) ZZ_pEX.txt ZZ_pEXFactoring.txt ZZ_pX.txt ZZ_pXFactoring.txt

252	255	D14=$(D13) tour-modules.html tour-unix.html tour-examples.html
253	256	D15=$(D14) tour-roadmap.html tour-win.html tour-impl.html tour-struct.html
254	257	D16=$(D15) tour.html tour-ex1.html tour-ex2.html tour-ex3.html tour-ex4.html
255		D17=$(D16) tour-ex5.html tour-ex6.html arrow1.gif arrow2.gif arrow3.gif
	258	D17=$(D16) tour-ex5.html tour-ex6.html tour-ex7.html arrow1.gif arrow2.gif arrow3.gif
256	259	D18=$(D17) tour-gmp.html tour-gf2x.html tour-tips.html config.txt version.txt
257	260
258	261	TX01=GF2.txt GF2E.txt GF2EX.txt GF2EXFactoring.txt GF2X.txt GF2XFactoring.txt

263	266	TX06=mat_ZZ_pE.txt mat_lzz_p.txt mat_lzz_pE.txt mat_poly_ZZ.txt mat_poly_ZZ_p.txt
264	267	TX07=mat_poly_lzz_p.txt matrix.txt pair.txt quad_float.txt tools.txt vec_GF2.txt
265	268	TX08=vec_GF2E.txt vec_RR.txt vec_ZZ.txt vec_ZZ_p.txt vec_ZZ_pE.txt vec_lzz_p.txt
266		TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt
	269	TX09=vec_lzz_pE.txt vector.txt version.txt xdouble.txt BasicThreadPool.txt
267	270
268	271	TXFILES=$(TX01) $(TX02) $(TX03) $(TX04) $(TX05) $(TX06) $(TX07) $(TX08) $(TX09)
269	272

275	278	HT06=mat_ZZ_pE.cpp.html mat_lzz_p.cpp.html mat_lzz_pE.cpp.html mat_poly_ZZ.cpp.html mat_poly_ZZ_p.cpp.html
276	279	HT07=mat_poly_lzz_p.cpp.html matrix.cpp.html pair.cpp.html quad_float.cpp.html tools.cpp.html vec_GF2.cpp.html
277	280	HT08=vec_GF2E.cpp.html vec_RR.cpp.html vec_ZZ.cpp.html vec_ZZ_p.cpp.html vec_ZZ_pE.cpp.html vec_lzz_p.cpp.html
278		HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html
	281	HT09=vec_lzz_pE.cpp.html vector.cpp.html version.cpp.html xdouble.cpp.html BasicThreadPool.cpp.html
279	282
280	283	HTFILES=$(HT01) $(HT02) $(HT03) $(HT04) $(HT05) $(HT06) $(HT07) $(HT08) $(HT09)
281	284

287	290	# test program executables
288	291
289	292	PROG1=QuickTest BerlekampTest CanZassTest ZZXFacTest MoreFacTest LLLTest BitMatTest
290		PROG2=$(PROG1) MatrixTest CharPolyTest RRTest QuadTest
	293	PROG2=$(PROG1) MatrixTest mat_lzz_pTest CharPolyTest RRTest QuadTest
291	294	PROG3=$(PROG2) GF2XTest GF2EXTest subset ZZ_pEXTest lzz_pEXTest Timing ThreadTest
292	295	PROGS = $(PROG3)
293	296
294	297	# things to save to a tar file
295	298
296	299	SFI1=makefile $(SRC) $(SINC) $(SCRIPTS) $(MD) $(GT) $(GP) $(CH) $(TS) $(TD) mach_desc.win
297		SFI2=$(SFI1) MulTimeTest.c PolyTimeTest.c Poly1TimeTest.c GF2XTimeTest.c
	300	SFI2=$(SFI1) MulTimeTest.c Poly1TimeTest.c Poly2TimeTest.c Poly3TimeTest.c GF2XTimeTest.c
298	301	SFI3=$(SFI2) InitSettings.c DispSettings.c WizardAux Wizard def_makefile
299	302	SFILES=$(SFI3)
300	303

309	312	NTL_INCLUDE = -I../include -I.
310	313	# NTL needs this to find its include files
311	314
312		COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) -c
313
314		LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS)
	315	COMPILE = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) -c
	316
	317	LINK = $(CXX) $(NTL_INCLUDE) $(CPPFLAGS) $(CXXAUTOFLAGS) $(CXXFLAGS) $(LDFLAGS)
315	318
316	319
317	320

341	344	# setup2 does some dynamic checks for GetTime, GetPID, __builtin_clzl, and LL types
342	345
343	346	setup2:
	347	echo "* CheckFeature log *" > CheckFeature.log
344	348	sh MakeGetTime "$(LINK)" "$(LDLIBS)"
345	349	sh MakeGetPID "$(LINK)" "$(LDLIBS)"
346		sh MakeCheckCLZL "$(LINK)" "$(LDLIBS)"
347		sh MakeCheckLL "$(LINK)" "$(LDLIBS)"
	350	sh MakeCheckFeature BUILTIN_CLZL "CheckCLZL.c CheckCLZLAux.c" "$(LINK)" "$(LDLIBS)"
	351	sh MakeCheckFeature LL_TYPE "CheckLL.c CheckLLAux.c" "$(LINK)" "$(LDLIBS)"
	352	sh MakeCheckFeature AVX "CheckAVX.c" "$(LINK)" "$(LDLIBS)"
	353	sh MakeCheckFeature FMA "CheckFMA.c" "$(LINK)" "$(LDLIBS)"
348	354
349	355	# setup3 generates the file ../include/NTL/gmp_aux.h
350	356	# The file ../include/NTL/gmp_aux.h is included in ../include/NTL/lip.h

382	388	GetPID.o: GetPID.c
383	389	$(LCOMP) $(COMPILE) GetPID.c
384	390
385		CheckPCLMUL: CheckPCLMUL.c
386		$(LINK) -o CheckPCLMUL CheckPCLMUL.c $(LDLIBS)
	391	CheckCompile: CheckCompile.c
	392	$(LINK) -o CheckCompile CheckCompile.c $(LDLIBS)
	393
387	394
388	395	.c.o:
389	396	$(LCOMP) $(COMPILE) $(GF2X_OPT_INCDIR) $<

460	467
461	468	clobber:
462	469	rm -f ntl.a mach_desc.h ../include/NTL/mach_desc.h GetTime.c GetPID.c
463		cp ../include/NTL/have_LL_no.h ../include/NTL/have_LL.h
464		cp ../include/NTL/have_builtin_clzl_no.h ../include/NTL/have_builtin_clzl.h
	470	sh ResetFeatures '..'
465	471	rm -f ../include/NTL/gmp_aux.h
466		sh RemoveProg $(PROGS) MakeDesc TestGetTime TestGetPID gen_gmp_aux
	472	sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
467	473	rm -f *.o
468	474	rm -rf small
469	475	rm -f cfileout mfileout

471	477	rm -f all
472	478
473	479	clean:
474		sh RemoveProg MakeDesc TestGetTime TestGetPID gen_gmp_aux
	480	sh RemoveProg $(PROGS) MakeDesc $(AUXPROGS) gen_gmp_aux
475	481	rm -f *.o
476	482	rm -rf small
477	483	@{LSHAR} - $(LIBTOOL) --mode=clean rm -f libntl.la *.lo #LSHAR

497	503
498	504
499	505	package:
	506	./configure --nowrite
	507	cp mfileout def_makefile
	508	cp cfileout ../include/NTL/def_config.h
500	509	sh unixify "$(SFILES) DIRNAME WINDIR VERSION_INFO NOTES" "$(INCL)" "$(DOC)"
501	510	rm -rf `cat DIRNAME`
502	511	rm -f `cat DIRNAME`.tar

508	517	rm -rf `cat DIRNAME`
509	518
510	519	winpack:
	520	./configure --nowrite NTL_GMP_LIP=off
	521	cp mfileout def_makefile
	522	cp cfileout ../include/NTL/def_config.h
511	523	sh dosify "$(SRC)" "$(INCL)" "$(DOC)" "$(TS)" "$(TD)" "$(SINC)"
512	524	rm -rf `cat WINDIR`
513	525	rm -f `cat WINDIR`.zip

526	538
527	539	WO1 = FFT.o GetTime.o GetPID.o ctools.o ZZ.o ZZVec.o ZZ_p.o ZZ_pX.o
528	540	WO2 = $(WO1) ZZ_pX1.o lip.o tools.o vec_ZZ.o vec_ZZ_p.o
529		WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o fileio.o
	541	WO3 = $(WO2) GF2.o WordVector.o vec_GF2.o GF2X.o GF2X1.o thread.o BasicThreadPool.o fileio.o
530	542
531	543	WOBJ = $(WO3)
532	544

538	550	MulTimeTest:
539	551	$(LINK) -o MulTimeTest MulTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
540	552
541		PolyTimeTest:
542		$(LINK) -o PolyTimeTest PolyTimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
543	553
544	554	Poly1TimeTest:
545	555	$(LINK) -o Poly1TimeTest Poly1TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
	556	Poly2TimeTest:
	557	$(LINK) -o Poly2TimeTest Poly2TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
	558	Poly3TimeTest:
	559	$(LINK) -o Poly3TimeTest Poly3TimeTest.c wntl.a $(GMP_OPT_LIBDIR) $(GMP_OPT_LIB) $(LDLIBS)
546	560
547	561
548	562	GF2XTimeTest:

-1

src/ppscript less more

7	7	do
8	8	name=`basename $i .txt`
9	9	cp $name.txt $name.cpp
10		$VIM $name.cpp '+set nu!' '+TOhtml' '+:1,$s/.*@anchor{\(.*\)}.*/<a name="\1"><\/a>/' '+w' '+qa!'
	10	$VIM $name.cpp '+set nu!' '+let c_no_curly_error=1' '+syntax off' '+syntax on' '+TOhtml' '+:1,$s/.*@anchor{\(.*\)}.*/<a name="\1"><\/a>/' '+w' '+qa!'
11	11	done
12	12
13	13

+108

-71

src/quad_float.c less more

43	43	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
44	44	*/
45	45
	46
	47
	48	#ifdef __INTEL_COMPILER
	49	#pragma float_control(precise,on)
	50	#endif
	51
	52	// NOTE: the above will force the Intel compiler to adhere to
	53	// language standards, which it does not do by default
	54
46	55	#include <NTL/quad_float.h>
47	56	#include <NTL/RR.h>
48	57

112	121
113	122
114	123	#if (NTL_BITS_PER_LONG >= NTL_DOUBLE_PRECISION)
	124
	125
115	126	quad_float to_quad_float(long n)
116	127	{
117		START_FIX
118	128	DOUBLE xhi, xlo;
119		DOUBLE u, v;
120
121		xhi = double(n);
	129
	130	xhi = TrueDouble(n);
122	131
123	132	// Because we are assuming 2's compliment integer
124	133	// arithmetic, the following prevents long(xhi) from overflowing.
125	134
126	135	if (n > 0)
127		xlo = double(n+long(-xhi));
	136	xlo = TrueDouble(n+long(-xhi));
128	137	else
129		xlo = double(n-long(xhi));
	138	xlo = TrueDouble(n-long(xhi));
130	139
131	140	// renormalize...just to be safe
132
133		u = xhi + xlo;
134		v = xhi - u;
135		v = v + xlo;
136		END_FIX
137		return quad_float(u, v);
	141
	142	quad_float z;
	143	normalize(z, xhi, xlo);
	144	return z;
138	145	}
139	146
140	147	quad_float to_quad_float(unsigned long n)
141	148	{
142		START_FIX
143	149	DOUBLE xhi, xlo, t;
144		DOUBLE u, v;
145	150
146	151	const double bnd = double(1L << (NTL_BITS_PER_LONG-2))*4.0;
147	152
148		xhi = double(n);
	153	xhi = TrueDouble(n);
149	154
150	155	if (xhi >= bnd)
151	156	t = xhi - bnd;

154	159
155	160	// we use the "to_long" function here to be as portable as possible.
156	161	long llo = to_long(n - (unsigned long)(t));
157		xlo = double(llo);
158
159		// renormalize...just to be safe
160
161		u = xhi + xlo;
162		v = xhi - u;
163		v = v + xlo;
164		END_FIX
165		return quad_float(u, v);
	162	xlo = TrueDouble(llo);
	163
	164	quad_float z;
	165	normalize(z, xhi, xlo);
	166	return z;
166	167	}
167	168	#endif
168	169
169	170
170		NTL_THREAD_LOCAL
	171	NTL_CHEAP_THREAD_LOCAL
171	172	long quad_float::oprec = 10;
172	173
173	174	void quad_float::SetOutputPrecision(long p)

351	352	}
352	353
353	354
	355
	356	#if (NTL_FMA_DETECTED)
	357
	358	double quad_float_zero = 0;
	359
	360	static inline
	361	double Protect(double x) { return x + quad_float_zero; }
	362
	363	#else
	364
	365
	366	static inline
	367	double Protect(double x) { return x; }
	368
	369
	370	#endif
	371
	372	// NOTE: this is really sick: some compilers will issue FMA
	373	// (fused mul add) instructions which will break correctness.
	374	// C99 standard is supposed to prevent this across separate
	375	// statements, but C++ standard doesn't guarantee much at all.
	376	// In any case, gcc does not even implement the C99 standard
	377	// correctly. One could disable this by compiling with
	378	// an appropriate flag: -mno-fma works for gcc, while -no-fma works
	379	// for icc. icc and MSVC++ also support pragmas to do this:
	380	// #pragma fp_contract(off). There is also a compiler flag for
	381	// gcc: -ffp-contract=off, but -mno-fma seems more widely supported.
	382	// These flags work for clang, as well.
	383	//
	384	// But in any case, I'd rather not mess with getting these flags right.
	385	// By calling Protect(a*b), this has the effect of forcing the
	386	// compiler to compute a*b + 0. Assuming the compiler otherwise
	387	// does not perform any re-association, this should do the trick.
	388	// There is a small performance penalty, but it should be reasonable.
	389
	390
	391
354	392	quad_float operator *(const quad_float& x,const quad_float& y ) {
355	393	START_FIX
356	394	DOUBLE hx, tx, hy, ty, C, c;
357	395	DOUBLE t1, t2;
358	396
359		C = NTL_QUAD_FLOAT_SPLIT*x.hi;
	397	C = Protect(NTL_QUAD_FLOAT_SPLIT*x.hi);
360	398	hx = C-x.hi;
361		c = NTL_QUAD_FLOAT_SPLIT*y.hi;
	399	c = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
362	400	hx = C-hx;
363	401	tx = x.hi-hx;
364	402	hy = c-y.hi;
365		C = x.hi*y.hi;
	403	C = Protect(x.hi*y.hi);
366	404	hy = c-hy;
367	405	ty = y.hi-hy;
368	406
369	407	// c = ((((hx*hy-C)+hx*ty)+tx*hy)+tx*ty)+(x.hi*y.lo+x.lo*y.hi);
370	408
371		t1 = hx*hy;
	409	t1 = Protect(hx*hy);
372	410	t1 = t1-C;
373		t2 = hx*ty;
	411	t2 = Protect(hx*ty);
374	412	t1 = t1+t2;
375		t2 = tx*hy;
	413	t2 = Protect(tx*hy);
376	414	t1 = t1+t2;
377		t2 = tx*ty;
	415	t2 = Protect(tx*ty);
378	416	c = t1+t2;
379		t1 = x.hi*y.lo;
380		t2 = x.lo*y.hi;
	417	t1 = Protect(x.hi*y.lo);
	418	t2 = Protect(x.lo*y.hi);
381	419	t1 = t1+t2;
382	420	c = c + t1;
383	421

395	433	DOUBLE hx, tx, hy, ty, C, c;
396	434	DOUBLE t1, t2;
397	435
398		C = NTL_QUAD_FLOAT_SPLIT*x.hi;
	436	C = Protect(NTL_QUAD_FLOAT_SPLIT*x.hi);
399	437	hx = C-x.hi;
400		c = NTL_QUAD_FLOAT_SPLIT*y.hi;
	438	c = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
401	439	hx = C-hx;
402	440	tx = x.hi-hx;
403	441	hy = c-y.hi;
404		C = x.hi*y.hi;
	442	C = Protect(x.hi*y.hi);
405	443	hy = c-hy;
406	444	ty = y.hi-hy;
407	445
408	446	// c = ((((hx*hy-C)+hx*ty)+tx*hy)+tx*ty)+(x.hi*y.lo+x.lo*y.hi);
409	447
410		t1 = hx*hy;
	448	t1 = Protect(hx*hy);
411	449	t1 = t1-C;
412		t2 = hx*ty;
	450	t2 = Protect(hx*ty);
413	451	t1 = t1+t2;
414		t2 = tx*hy;
	452	t2 = Protect(tx*hy);
415	453	t1 = t1+t2;
416		t2 = tx*ty;
	454	t2 = Protect(tx*ty);
417	455	c = t1+t2;
418		t1 = x.hi*y.lo;
419		t2 = x.lo*y.hi;
	456	t1 = Protect(x.hi*y.lo);
	457	t2 = Protect(x.lo*y.hi);
420	458	t1 = t1+t2;
421	459	c = c + t1;
422	460

437	475	DOUBLE t1;
438	476
439	477	C = x.hi/y.hi;
440		c = NTL_QUAD_FLOAT_SPLIT*C;
	478	c = Protect(NTL_QUAD_FLOAT_SPLIT*C);
441	479	hc = c-C;
442		u = NTL_QUAD_FLOAT_SPLIT*y.hi;
	480	u = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
443	481	hc = c-hc;
444	482	tc = C-hc;
445	483	hy = u-y.hi;
446		U = C * y.hi;
	484	U = Protect(C * y.hi);
447	485	hy = u-hy;
448	486	ty = y.hi-hy;
449	487
450	488	// u = (((hc*hy-U)+hc*ty)+tc*hy)+tc*ty;
451	489
452		u = hc*hy;
	490	u = Protect(hc*hy);
453	491	u = u-U;
454		t1 = hc*ty;
	492	t1 = Protect(hc*ty);
455	493	u = u+t1;
456		t1 = tc*hy;
	494	t1 = Protect(tc*hy);
457	495	u = u+t1;
458		t1 = tc*ty;
	496	t1 = Protect(tc*ty);
459	497	u = u+t1;
460	498
461	499	// c = ((((x.hi-U)-u)+x.lo)-C*y.lo)/y.hi;

463	501	c = x.hi-U;
464	502	c = c-u;
465	503	c = c+x.lo;
466		t1 = C*y.lo;
	504	t1 = Protect(C*y.lo);
467	505	c = c - t1;
468	506	c = c/y.hi;
469	507

481	519	DOUBLE t1;
482	520
483	521	C = x.hi/y.hi;
484		c = NTL_QUAD_FLOAT_SPLIT*C;
	522	c = Protect(NTL_QUAD_FLOAT_SPLIT*C);
485	523	hc = c-C;
486		u = NTL_QUAD_FLOAT_SPLIT*y.hi;
	524	u = Protect(NTL_QUAD_FLOAT_SPLIT*y.hi);
487	525	hc = c-hc;
488	526	tc = C-hc;
489	527	hy = u-y.hi;
490		U = C * y.hi;
	528	U = Protect(C * y.hi);
491	529	hy = u-hy;
492	530	ty = y.hi-hy;
493	531
494	532	// u = (((hc*hy-U)+hc*ty)+tc*hy)+tc*ty;
495	533
496		u = hc*hy;
	534	u = Protect(hc*hy);
497	535	u = u-U;
498		t1 = hc*ty;
	536	t1 = Protect(hc*ty);
499	537	u = u+t1;
500		t1 = tc*hy;
	538	t1 = Protect(tc*hy);
501	539	u = u+t1;
502		t1 = tc*ty;
	540	t1 = Protect(tc*ty);
503	541	u = u+t1;
504	542
505	543	// c = ((((x.hi-U)-u)+x.lo)-C*y.lo)/y.hi;

507	545	c = x.hi-U;
508	546	c = c-u;
509	547	c = c+x.lo;
510		t1 = C*y.lo;
	548	t1 = Protect(C*y.lo);
511	549	c = c - t1;
512	550	c = c/y.hi;
513	551

536	574	DOUBLE p,q,hx,tx,u,uu,cc;
537	575	DOUBLE t1;
538	576
539		p = NTL_QUAD_FLOAT_SPLIT*c;
	577	p = Protect(NTL_QUAD_FLOAT_SPLIT*c);
540	578	hx = (c-p);
541	579	hx = hx+p;
542	580	tx = c-hx;
543		p = hx*hx;
544		q = hx*tx;
	581	p = Protect(hx*hx);
	582	q = Protect(hx*tx);
545	583	q = q+q;
546	584
547	585	u = p+q;
548	586	uu = p-u;
549	587	uu = uu+q;
550		t1 = tx*tx;
	588	t1 = Protect(tx*tx);
551	589	uu = uu+t1;
552	590
553	591

694	732	RR::SetPrecision(long(3.33*quad_float::oprec) + 10);
695	733	RR::SetOutputPrecision(quad_float::oprec);
696	734
697		NTL_THREAD_LOCAL static RR t;
	735	NTL_TLS_LOCAL(RR, t);
698	736
699	737	conv(t, a);
700	738	s << t;

707	745	RRPush push;
708	746	RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
709	747
710		NTL_THREAD_LOCAL static RR t;
	748	NTL_TLS_LOCAL(RR, t);
711	749	NTL_INPUT_CHECK_RET(s, s >> t);
712	750	conv(x, t);
713	751

719	757	RRPush push;
720	758	RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
721	759
722		NTL_THREAD_LOCAL static RR t;
	760	NTL_TLS_LOCAL(RR, t);
723	761	random(t);
724	762	conv(x, t);
725	763	}

812	850	RRPush push;
813	851	RR::SetPrecision(4*NTL_DOUBLE_PRECISION);
814	852
815		NTL_THREAD_LOCAL static RR t;
	853	NTL_TLS_LOCAL(RR, t);
816	854	conv(t, s);
817	855	conv(x, t);
818	856

855	893	ResourceError("exp(quad_float): overflow");
856	894	}
857	895
858		// changed this from "const" to "static" in v5.3, since "const"
859		// causes the initialization to be performed with every invocation.
860		NTL_THREAD_LOCAL static quad_float Log2 =
861		to_quad_float("0.6931471805599453094172321214581765680755");
	896	static const quad_float Log2 = to_quad_float("0.6931471805599453094172321214581765680755");
	897	// GLOBAL (assumes C++11 thread-safe init)
862	898
863	899	quad_float y,temp,ysq,sum1,sum2;
864	900	long iy;

911	947
912	948
913	949	NTL_END_IMPL
	950

-2

src/thread.c less more

14	14
15	15	const string& CurrentThreadID()
16	16	{
17		NTL_THREAD_LOCAL static string ID;
18		NTL_THREAD_LOCAL static bool initialized = false;
	17	NTL_TLS_LOCAL(string, ID);
	18	static NTL_CHEAP_THREAD_LOCAL bool initialized = false;
19	19
20	20	if (!initialized) {
21	21	#ifdef NTL_THREADS

-2

src/tools.c less more

9	9
10	10	NTL_START_IMPL
11	11
12		NTL_THREAD_LOCAL void (*ErrorCallback)() = 0;
13		NTL_THREAD_LOCAL void (*ErrorMsgCallback)(const char *) = 0;
	12	NTL_CHEAP_THREAD_LOCAL void (*ErrorCallback)() = 0;
	13	NTL_CHEAP_THREAD_LOCAL void (*ErrorMsgCallback)(const char *) = 0;
14	14
15	15
16	16	void TerminalError(const char *s)

-3

src/unixify less more

25	25	done
26	26
27	27	cp ../include/NTL/def_config.h unix/include/NTL/config.h
28		cp ../include/NTL/have_LL_no.h unix/include/NTL/have_LL.h
29		cp ../include/NTL/have_builtin_clzl_no.h unix/include/NTL/have_builtin_clzl.h
30
31	28	cp def_makefile unix/src/makefile
	29	sh ResetFeatures unix

-1

src/vec_RR.c less more

32	32
33	33	void mul(vec_RR& x, const vec_RR& a, double b_in)
34	34	{
35		NTL_THREAD_LOCAL static RR b;
	35	NTL_TLS_LOCAL(RR, b);
36	36	conv(b, b_in);
37	37	long n = a.length();
38	38	x.SetLength(n);

+19

-1

src/vec_lzz_p.c less more

17	17	long p = zz_p::modulus();
18	18
19	19	for (i = 0; i < n; i++)
20		xp[i].LoopHole() = rem(a[i], p);
	20	xp[i].LoopHole() = rem(ap[i], p);
	21	}
	22	//
	23	// NOTE: the signature for this is in lzz_p.h
	24	void conv(vec_zz_p& x, const Vec<long>& a)
	25	{
	26	long i, n;
	27
	28	n = a.length();
	29	x.SetLength(n);
	30
	31	zz_p* xp = x.elts();
	32	const long* ap = a.elts();
	33
	34	long p = zz_p::modulus();
	35	sp_reduce_struct red_struct = zz_p::red_struct();
	36
	37	for (i = 0; i < n; i++)
	38	xp[i].LoopHole() = rem(ap[i], p, red_struct);
21	39	}
22	40
23	41

-7

src/xdouble.c less more

8	8
9	9
10	10
11		NTL_THREAD_LOCAL
	11	NTL_CHEAP_THREAD_LOCAL
12	12	long xdouble::oprec = 10;
13	13
14	14	void xdouble::SetOutputPrecision(long p)

270	270	RRPush push;
271	271	RR::SetPrecision(NTL_DOUBLE_PRECISION);
272	272
273		NTL_THREAD_LOCAL static RR t;
	273	NTL_TLS_LOCAL(RR, t);
274	274	conv(t, a);
275	275
276	276	double x;

293	293	RRPush push;
294	294	RR::SetPrecision(NTL_DOUBLE_PRECISION);
295	295
296		NTL_THREAD_LOCAL static RR t;
	296	NTL_TLS_LOCAL(RR, t);
297	297	conv(t, b);
298	298	conv(x, t);
299	299	}

512	512
513	513	double log(const xdouble& a)
514	514	{
515		NTL_THREAD_LOCAL static double LogBound = log(NTL_XD_BOUND);
	515	static const double LogBound = log(NTL_XD_BOUND); // GLOBAL (assumes C++11 thread-safe init)
516	516	if (a.x <= 0) {
517	517	ArithmeticError("log(xdouble): argument must be positive");
518	518	}

565	565
566	566	xdouble PowerOf10(const ZZ& e)
567	567	{
568		NTL_THREAD_LOCAL static long init = 0;
569		NTL_THREAD_LOCAL static xdouble v10k;
570		NTL_THREAD_LOCAL static long k;
	568	static NTL_CHEAP_THREAD_LOCAL long init = 0;
	569	static NTL_CHEAP_THREAD_LOCAL long k = 0;
	570
	571	NTL_TLS_LOCAL(xdouble, v10k);
571	572
572	573	if (!init) {
573	574	k = ComputeMax10Power();