<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<title>~/ntl-10.5.0test/doc/BasicThreadPool.cpp.html</title>
<meta name="Generator" content="Vim/8.0">
<meta name="plugin-version" content="vim7.4_v2">
<meta name="syntax" content="cpp">
<meta name="settings" content="use_css,pre_wrap,no_foldcolumn,expand_tabs,prevent_copy=">
<meta name="colorscheme" content="macvim">
<style type="text/css">
<!--
pre { white-space: pre-wrap; font-family: monospace; color: #000000; background-color: #ffffff; }
body { font-family: monospace; color: #000000; background-color: #ffffff; }
* { font-size: 1em; }
.PreProc { color: #1874cd; }
.Constant { color: #ff8c00; }
.Statement { color: #b03060; font-weight: bold; }
.Comment { color: #0000ee; font-style: italic; }
.Type { color: #008b00; font-weight: bold; }
-->
</style>
<script type='text/javascript'>
<!--
-->
</script>
</head>
<body>
<pre id='vimCodeElement'>
<span class="Comment">/*</span><span class="Comment">***********************************************************************</span>
<span class="Comment">MODULE: BasicThreadPool</span>
<span class="Comment">SUMMARY:</span>
<span class="Comment">A simple thread pool class BasicThreadPool, as well as some higher-level macros</span>
<span class="Comment">which facilitite simple parallel for loops.</span>
<span class="Comment">**************************************************************************</span><span class="Comment">*/</span>
<span class="Comment">// ********************** Simple parallel for loops **************************</span>
<span class="Comment">// </span>
<span class="Comment">// We begin with a description of the higher-level macros for writing simple</span>
<span class="Comment">// parallel for loops. These facilitaties are activated only when NTL is</span>
<span class="Comment">// configured with NTL_THREAD_BOOST=on (which implies NTL_THREADS=on).</span>
<span class="Comment">// However, code that uses these facilties should still compile and run</span>
<span class="Comment">// correctly even when NTL_THREAD_BOOST=off, or even when NTL_THREADS=off, so</span>
<span class="Comment">// this is the simplest way to write parallel for loops across a range of</span>
<span class="Comment">// compile-time and run-time environments. Note that if NTL_THREADS=on, C++11</span>
<span class="Comment">// features are reqired, but when NTL_THREADS=off, these features are not</span>
<span class="Comment">// required, so the code should compile on older C++ compilers.</span>
<span class="Comment">// </span>
<span class="Comment">// Here is a simple recipe for writing parallel for loop.</span>
<span class="Comment">// </span>
<span class="Comment">// At the start of program execution, your program should execute</span>
SetNumThreads(nt);
<span class="Comment">// You can choose nt to be any positive integer, but for best results, it</span>
<span class="Comment">// should correspond to the number of available cores on your machine.</span>
<span class="Comment">// [NOTE: if NTL_THREAD_BOOST=off, this function is still defined, but does</span>
<span class="Comment">// nothing.]</span>
<span class="Comment">// </span>
<span class="Comment">// Now consider the following routine:</span>
<span class="Type">void</span> mul(ZZ *x, <span class="Type">const</span> ZZ *a, <span class="Type">const</span> ZZ *b, <span class="Type">long</span> n)
{
<span class="Statement">for</span> (<span class="Type">long</span> i = <span class="Constant">0</span>; i < n; i++)
mul(x[i], a[i], b[i]);
}
<span class="Comment">// We can parallelize it as follows:</span>
<span class="Type">void</span> mul(ZZ *x, <span class="Type">const</span> ZZ *a, <span class="Type">const</span> ZZ *b, <span class="Type">long</span> n)
{
NTL_EXEC_RANGE(n, first, last)
<span class="Statement">for</span> (<span class="Type">long</span> i = first; i < last; i++)
mul(x[i], a[i], b[i]);
NTL_EXEC_RANGE_END
}
<span class="Comment">// NTL_EXEC_RANGE and NTL_EXEC_RANGE_END are macros that just "do the right</span>
<span class="Comment">// thing". If there are nt threads available, the interval [0..n) will be</span>
<span class="Comment">// partitioned into (up to) nt subintervals, and a different thread will be</span>
<span class="Comment">// used to process each subinterval. You still have to write the for loop</span>
<span class="Comment">// yourself: the macro just declares and initializes variables "first" and</span>
<span class="Comment">// "last" (or whatever you want to call them) of type long that represent the</span>
<span class="Comment">// subinterval [first..last) to be processed by one thread.</span>
<span class="Comment">// </span>
<span class="Comment">// Note that the current thread participates as one of the nt available</span>
<span class="Comment">// threads, and that the current thread will wait for all other participating threads</span>
<span class="Comment">// to finish their task before proceeding. The current thread can be identified</span>
<span class="Comment">// as the one with first == 0.</span>
<span class="Comment">// </span>
<span class="Comment">// Withing the "body" of this construct, you can freely reference any variables</span>
<span class="Comment">// that are visible at this point. This is implemented using the C++ lambda</span>
<span class="Comment">// feature (capturing all variables by reference).</span>
<span class="Comment">// </span>
<span class="Comment">// This construct will still work even if threads are disabled, in which case</span>
<span class="Comment">// it runs single-threaded with first=0 and last=n.</span>
<span class="Comment">// </span>
<span class="Comment">// Note that the code within the EXEC_RANGE body could call other routines that</span>
<span class="Comment">// themselves attempt to execute an EXEC_RANGE: if this happens, the latter</span>
<span class="Comment">// EXEC_RANGE will detect this and run single-threaded.</span>
<span class="Comment">// </span>
<span class="Comment">// You may wish to do other things within the EXEC_RANGE body than just execute</span>
<span class="Comment">// a loop. One thing you may want to do is to declare variables. Another</span>
<span class="Comment">// thing you may want to do is setup a local context for a ZZ_p modulus (or</span>
<span class="Comment">// other type of modulus). Here is an example of doing this:</span>
<span class="Type">void</span> mul(ZZ_p *x, <span class="Type">const</span> ZZ_p *a, <span class="Type">const</span> ZZ_p *b, <span class="Type">long</span> n)
{
ZZ_pContext context;
context.save();
NTL_EXEC_RANGE(n, first, last)
context.restore();
<span class="Statement">for</span> (<span class="Type">long</span> i = first; i < last; i++)
mul(x[i], a[i], b[i]);
NTL_EXEC_RANGE_END
}
<span class="Comment">// Another useful function is AvailableThreads(), which will return the number</span>
<span class="Comment">// of available threads. If threads or thread boosting is not enabled, this</span>
<span class="Comment">// will return 1. Even if thread boosting is enabled, this may return 1 if for</span>
<span class="Comment">// whatever reason, the thread pool is not available for use (for example,</span>
<span class="Comment">// SetNumThreads was never called, or the thread pool is already active).</span>
<span class="Comment">// </span>
<span class="Comment">// A lower-level set of tools is available, which allow you to simply run a</span>
<span class="Comment">// specified number of threads. Assuming nt <= AvailableThreads(), the code</span>
NTL_EXEC_INDEX(nt, index)
... code ...
NTL_EXEC_INDEX_END
<span class="Comment">// will execute the body on nt different threads, each with a unique index in</span>
<span class="Comment">// the range [0..nt). A variable named "index" (or whatever name you specify)</span>
<span class="Comment">// of type long will hold the given index. Just as with EXEC_RANGE, the current</span>
<span class="Comment">// thread will participate as one of the nt threads, and will always be</span>
<span class="Comment">// assigned an index of 0.</span>
<span class="Comment">// </span>
<span class="Comment">// This tool is useful if you need to manage memory a bit more carefully. For</span>
<span class="Comment">// example, the following code will compute an inner product using all</span>
<span class="Comment">// available threads:</span>
ZZ InnerProd(<span class="Type">const</span> ZZ *a, <span class="Type">const</span> ZZ *b, <span class="Type">long</span> n)
{
PartitionInfo pinfo(n);
<span class="Type">long</span> cnt = pinfo.NumIntervals();
Vec<ZZ> acc;
acc.SetLength(cnt);
NTL_EXEC_INDEX(cnt, index)
<span class="Type">long</span> first, last;
pinfo.interval(first, last, index);
ZZ& sum = acc[index];
sum = <span class="Constant">0</span>;
<span class="Statement">for</span> (<span class="Type">long</span> i = first; i < last; i++)
MulAddTo(sum, a[i], b[i]);
NTL_EXEC_INDEX_END
ZZ sum;
sum = <span class="Constant">0</span>;
<span class="Statement">for</span> (<span class="Type">long</span> i = <span class="Constant">0</span>; i < cnt; i++)
sum += acc[i];
<span class="Statement">return</span> sum;
}
<span class="Comment">// This example also illustrates the class PartitionInfo, which is useful for</span>
<span class="Comment">// partitioning a large interval into smaller intervals (it is used internally</span>
<span class="Comment">// by EXEC_RANGE). The constructor takes a single argument (in this example n)</span>
<span class="Comment">// and computes a partition of [0..n) into nearly equally sized subintervals.</span>
<span class="Comment">// The method NumIntervals() returns the number of subintervals, and the method</span>
<span class="Comment">// interval(first, last, index) sets first and last according to the endpoints</span>
<span class="Comment">// of the subinterval [first..last) with the given index.</span>
<span class="Comment">// </span>
<span class="Comment">// So in this example, cnt threads will run, each accumulating a sum into a</span>
<span class="Comment">// corresponding element of the vector acc, and afterwords, these elements are</span>
<span class="Comment">// summed.</span>
<span class="Comment">// </span>
<span class="Comment">// Note that if threads are not enabled or otherwise unavailable, the above</span>
<span class="Comment">// code will compile and run correctly (just using one thread).</span>
<span class="Comment">// </span>
<span class="Comment">// Finally, there is a "guarded" version of NTL_EXEC_RANGE called</span>
<span class="Comment">// NTL_GEXEC_RANGE. This allows one to dynamically "guard" against parallel</span>
<span class="Comment">// execution. For example, on very small problems the runtime overhead of a</span>
<span class="Comment">// parallel for loop may not be worthwhile, or in other situations parallel</span>
<span class="Comment">// execution could cause incorrect behavior. See below for details.</span>
<span class="Comment">// ************************** Thread Pools ******************************</span>
<span class="Comment">// </span>
<span class="Comment">// The above facilities are built on top of a more general thread pool class,</span>
<span class="Comment">// which you may use for your own purposes.</span>
<span class="Comment">// </span>
<span class="Comment">// You create a thread pool by constructing a BasicThreadPool object. For</span>
<span class="Comment">// example:</span>
<span class="Type">long</span> nthreads = <span class="Constant">4</span>;
BasicThreadPool pool(nthreads);
<span class="Comment">// creates a thread pool of 4 threads. These threads will exist until the</span>
<span class="Comment">// destructor for pool is called. </span>
<span class="Comment">// </span>
<span class="Comment">// The simplest way to use a thread pools is as follows. Suppose you have a</span>
<span class="Comment">// task that consists of sz subtasks, indexed 0..sz-1. Then you can write:</span>
pool.exec_range(sz,
[&](<span class="Type">long</span> first, <span class="Type">long</span> last) {
<span class="Statement">for</span> (<span class="Type">long</span> i = first; i < last; i++) {
... code to process subtask i ...
}
}
);
<span class="Comment">// The second argument to exec_range is a C++11 "lambda". The "[&]" indicates</span>
<span class="Comment">// that all local variables in the calling context are captured by reference,</span>
<span class="Comment">// so the lambda body can reference all visible local variables directly.</span>
<span class="Comment">// C++11 provides other methods for capturing local variables. The interval</span>
<span class="Comment">// [0..sz) is partitioned into subintervals of the form [first..last), which</span>
<span class="Comment">// are processed by the code in the supplied lambda.</span>
<span class="Comment">// </span>
<span class="Comment">// A lower-level interface is also provided. One can write:</span>
pool.exec_index(cnt,
[&](<span class="Type">long</span> index) {
... code to process index i ...
}
);
<span class="Comment">// This will activate exactly cnt threads with indices 0..cnt-1, and execute</span>
<span class="Comment">// the given code on each index. The parameter cnt must not exceed nthreads,</span>
<span class="Comment">// otherwise an error is raised.</span>
<span class="Comment">// ====================================================================</span>
<span class="Comment">// </span>
<span class="Comment">// NOTES:</span>
<span class="Comment">// </span>
<span class="Comment">// When one activates a thread pool with nthreads threads, the *current* thread</span>
<span class="Comment">// (the one activating the pool) will also participate in the computation.</span>
<span class="Comment">// This means that the thread pool only contains nthreads-1 other threads.</span>
<span class="Comment">// </span>
<span class="Comment">// If, during an activation, any thread throws an exception, it will be caught</span>
<span class="Comment">// and rethrown in the activating thread when all the threads complete. If</span>
<span class="Comment">// more than one thread throws an exception, the first one that is caught is</span>
<span class="Comment">// the one that is rethrown.</span>
<span class="Comment">// </span>
<span class="Comment">// Methods are also provided for adding, deleting, and moving threads in and</span>
<span class="Comment">// among thread pools.</span>
<span class="Comment">// </span>
<span class="Comment">// If NTL_THREADS=off, the corresponding header file may be included, but the</span>
<span class="Comment">// BasicThreadPool class is not defined.</span>
<span class="Comment">//</span>
<span class="Comment">// Unlike most classes in NTL, the BasicThreadPool is not relocatable and hence</span>
<span class="Comment">// cannot be used in a Vec. One should first wrap it in a pointer class, such</span>
<span class="Comment">// as UniquePtr.</span>
<span class="Comment">// class BasicThreadPool: provided basic functionality for thread pools</span>
<span class="Type">class</span> BasicThreadPool {
<span class="Statement">private</span>:
BasicThreadPool(<span class="Type">const</span> BasicThreadPool&); <span class="Comment">// disabled</span>
<span class="Type">void</span> <span class="Statement">operator</span>=(<span class="Type">const</span> BasicThreadPool&); <span class="Comment">// disabled</span>
<span class="Statement">public</span>:
<span class="Type">explicit</span>
BasicThreadPool(<span class="Type">long</span> nthreads);
<span class="Comment">// creates a pool with nthreads threads, including the current thread</span>
<span class="Comment">// (so nthreads-1 other threads get created)</span>
<span class="Type">template</span><<span class="Type">class</span> Fct>
<span class="Type">void</span> exec_range(<span class="Type">long</span> sz, <span class="Type">const</span> Fct& fct);
<span class="Comment">// activate by range (see example usage above)</span>
<span class="Type">template</span><<span class="Type">class</span> Fct>
<span class="Type">void</span> exec_index(<span class="Type">long</span> cnt, <span class="Type">const</span> Fct& fct);
<span class="Comment">// activate by index (see example usage above)</span>
<span class="Type">void</span> add(<span class="Type">long</span> n = <span class="Constant">1</span>);
<span class="Comment">// add n threads to the pool</span>
<span class="Type">long</span> NumThreads() <span class="Type">const</span>;
<span class="Comment">// return number of threads (including current thread)</span>
<span class="Type">void</span> remove(<span class="Type">long</span> n = <span class="Constant">1</span>);
<span class="Comment">// remove n threads from the pool</span>
<span class="Type">void</span> move(BasicThreadPool& other, <span class="Type">long</span> n = <span class="Constant">1</span>)
<span class="Comment">// move n threads from other pool to this pool</span>
<span class="Type">bool</span> active() <span class="Type">const</span>;
<span class="Comment">// indicates an activation is in process: invoking any of the methods</span>
<span class="Comment">// exec_index, exec_range, add, remove, move, or the destructor</span>
<span class="Comment">// whie active will raise an error</span>
<span class="Type">template</span><<span class="Type">class</span> Fct>
<span class="Type">static</span> <span class="Type">void</span> relaxed_exec_range(BasicThreadPool *pool, <span class="Type">long</span> sz, <span class="Type">const</span> Fct& fct);
<span class="Comment">// similar to pool->exec_range(sz, fct), but will still work even </span>
<span class="Comment">// if !pool or pool->active(), using just the current thread</span>
<span class="Type">template</span><<span class="Type">class</span> Fct>
<span class="Type">static</span> <span class="Type">void</span> relaxed_exec_index(BasicThreadPool *pool, <span class="Type">long</span> cnt, <span class="Type">const</span> Fct& fct);
<span class="Comment">// similar to pool->exec_index(cnt, fct), but will still work even </span>
<span class="Comment">// if !pool or pool->active(), provided cnt <= 1, using just the current thread</span>
};
<span class="Comment">// THREAD BOOSTING FEATURES:</span>
<span class="Type">void</span> SetNumThreads(<span class="Type">long</span> nt);
<span class="Comment">// convenience routine to set NTL's thread pool.</span>
<span class="Comment">// If called more than once, the old thread pool is destroyed and</span>
<span class="Comment">// replaced by a new one.</span>
<span class="Comment">// If NTL_THREAD_BOOST=off, then this is still defined, but does nothing.</span>
<span class="Type">long</span> AvailableThreads();
<span class="Comment">// Number of threads currently availble to use in NTL's thread pool. This is</span>
<span class="Comment">// always at least 1 (for the current thread). </span>
<span class="Comment">// If NTL_THREAD_BOOST=off, then this is still defined, and always returns 1.</span>
BasicThreadPool *GetThreadPool();
<span class="Type">void</span> ResetThreadPool(BasicThreadPool *pool = <span class="Constant">0</span>);
BasicThreadPool *ReleaseThreadPool();
<span class="Comment">// Routines to get and set NTL's thread pool. The interfaces parallel NTL's</span>
<span class="Comment">// UniquePtr class, and indeed, behind the scenes, NTL's thread pool is stored</span>
<span class="Comment">// as a UniquePtr<BasicThreadPool>.</span>
<span class="Comment">// These are only declared when NTL_THREAD_BOOST=on. </span>
<span class="PreProc">#define NTL_EXEC_RANGE(sz, first, last) ...</span>
<span class="PreProc">#define NTL_EXEC_RANGE_END ...</span>
<span class="PreProc">#define NTL_EXEC_INDEX(cnt, index) ...</span>
<span class="PreProc">#define NTL_EXEC_INDEX_END ...</span>
<span class="Comment">// convenience macros to implement "parallel for loops" using NTL's thread</span>
<span class="Comment">// pool. See examples above for usage. If NTL_THREAD_BOOST=off, then these</span>
<span class="Comment">// are still defined, and code will run on a single thread</span>
<span class="PreProc">#define NTL_GEXEC_RANGE(seq, sz, first, last) ...</span>
<span class="PreProc">#define NTL_GEXEC_RANGE_END ...</span>
<span class="Comment">// "guarded" version of NTL_EXEC_RANGE: if seq evaluates to true, the code runs</span>
<span class="Comment">// on a single thread. This is useful in avoiding situations where the</span>
<span class="Comment">// overhead of a parallel loop is too high. If seq evaluates to the constant</span>
<span class="Comment">// true, a good compiler will optimize code to run on a single thread, with no</span>
<span class="Comment">// overhead.</span>
<span class="PreProc">#define NTL_IMPORT(x) </span>
<span class="Comment">// To be used in conjunction with NTL_EXEC_RANGE and friends. When</span>
<span class="Comment">// NTL_THREAD_BOOST=on, this will copy the variable named x from the enclosing</span>
<span class="Comment">// scope to a local copy. This should only be used for types with cheap</span>
<span class="Comment">// copies, such as scalars and pointers. In some situations, this allows the</span>
<span class="Comment">// compiler to optimize a bit more aggressively. One or more of these may be</span>
<span class="Comment">// placed right after an NTL_EXEC_RANGE.</span>
<span class="Comment">// When NTL_THREAD_BOOST=off, this is still defined, and does nothing.</span>
<span class="Comment">// class PartitionInfo: A helper class to facilitate partitioning an interval</span>
<span class="Comment">// into subintervals. NOTE: this class is available, even when</span>
<span class="Comment">// NTL_THREAD_BOOST=off.</span>
<span class="Type">class</span> PartitionInfo {
<span class="Statement">public</span>:
<span class="Type">explicit</span>
PartitionInfo(<span class="Type">long</span> sz, <span class="Type">long</span> nt = AvailableThreads());
<span class="Comment">// partitions [0..sz) into at most nt subintervals. sz may be 0 or</span>
<span class="Comment">// negative, in which case the number of subintervals is 0.</span>
<span class="Type">long</span> NumIntervals() <span class="Type">const</span>;
<span class="Comment">// return the number of subintervals</span>
<span class="Type">void</span> interval(<span class="Type">long</span>& first, <span class="Type">long</span>& last, <span class="Type">long</span> i) <span class="Type">const</span>;
<span class="Comment">// [first..last) is the ith interval, where i in [0..NumInvervals()). No</span>
<span class="Comment">// range checking is performed.</span>
};
</pre>
</body>
</html>
<!-- vim: set foldmethod=manual : -->