/* This code generates random numbers in parallel using only C++11.
 * It consists of two classes. 
 * The one doing the heavy lifting is called ThreadedGen. It does all the 
 * buffering and threading. 
 * The other one is RandEngine. Its only purpose is to wrap any combination 
 * of a C++ pseudo-random generator with any C++ distribution.
 * The RandEngine object must be created first.
 * When a ThreadedGen is created, it makes several internal copies of the
 * RandEngine (one per thread, each reseeded differently).
 *
 * The templated part is a bit messy but I see no better way as of now.
 * 
 * Example:
 * 
 * // Create a seeded generator
 * std::mt19937 gen(13); 
 * // Create the distribution with the proper parameters
 * std::normal_distribution<double> dist(0.0, 1.0);
 * // Here both the generator and the distribution are copied
 * RandEngine<std::mt19937, std::normal_distribution<double>, double> 
 *     dg(gen, dist);
 *
 * // Arguments: buffer_size = 40000, nbuffers = 40, nthreads = 5
 * ThreadedGen<RandEngine<std::mt19937, std::normal_distribution<double>, 
 *                        double>,
 *             double> r1(40000, 40, 5, dg);
 * 
 * std::cout << r1.get_value() << std::endl;
 */

#ifndef THREADEDGEN_H
#define THREADEDGEN_H

#include <vector>    // std::vector
#include <thread>    // std::thread
#include <mutex>     // std::mutex
#include <random>    // std::mt19937, std::uniform_real_distribution
#include <atomic>    // std::atomic
#include <stdexcept> // std::length_error
#include <condition_variable> // std::condition_variable




/* Helper class wrapping a random generator together with a distribution.
 * It is copyable (and movable): copy, move and destruction are the
 * compiler-generated ones (Rule of Zero), i.e. member-wise on the generator
 * and the distribution.
 * Gen_Type must be a COPYABLE random generator with no global state,
 * callable as gen() and reseedable with gen.seed(unsigned int).
 * Dist_Type must be a COPYABLE distribution with no global state, callable
 * as dist(gen).
 * Return_Type is the return type of the Dist_Type(Gen_Type) call.
 */
template <typename Gen_Type, typename Dist_Type, typename Return_Type> 
class RandEngine
{
public:
    // Stores private copies of the given generator and distribution.
    RandEngine(Gen_Type gen, Dist_Type dist)
        :
        m_gen(gen),
        m_dist(dist)
    {}

    // No user-declared copy constructor or destructor: the implicit ones do
    // exactly the same member-wise work, and not declaring them keeps the 
    // implicit move operations available.

    // Draws the next value of the distribution, advancing the generator.
    Return_Type genDist()
    {
        return m_dist(m_gen);
    }

    // Returns the next raw output of the generator (bypasses the
    // distribution). The value is narrowed to unsigned int, which truncates
    // the output of generators producing wider results.
    unsigned int genRand()
    {
        return static_cast<unsigned int>(m_gen());
    }
    
    // Reseeds the generator; the distribution object is left untouched.
    void reSeed(unsigned int seed)
    {
        m_gen.seed(seed);
    }

private:

    Gen_Type m_gen;   // Private copy of the generator
    Dist_Type m_dist; // Private copy of the distribution

};





/* Main class: a multi-threaded, buffered producer of random values.
 * It owns nbuffers buffers of buffer_size values each and nthreads worker
 * threads. Every thread is in charge of a fixed subset of the buffers 
 * (buffer b belongs to thread b % nthreads).
 * Each thread has its own independent copy of the RandEngine, reseeded with 
 * a value drawn from a single RandEngine on the constructing thread.
 * A thread fills the buffers it owns and then sleeps on its condition
 * variable until one of them has been handed back for refilling.
 * get_value() reads the buffers in order; once a buffer is exhausted it is
 * handed back to its thread, which is then woken up.
 * There cannot be more threads than buffers (the constructor throws).
 * Making more threads than actual cores is not very costly (up to a point) 
 * but not very useful.
 * Author's observation: having a lot of buffers (100x nthreads) has a 
 * positive impact on performance, for reasons not yet understood.
 */
template <typename RG, typename Return_Type> 
class ThreadedGen
{
public:
    // Allocates the buffers and starts the worker threads
    ThreadedGen(unsigned int buffer_size, unsigned int nbuffers, 
                unsigned int nthreads, RG rand_gen);
    // Stops and joins the worker threads
    ~ThreadedGen();
    // Returns the next random value taken from the buffers
    Return_Type get_value();

private:
    // One buffer of generated values
    using Buffer = std::vector<Return_Type>;

    // Body of each worker thread: fills buffers start_id, start_id + jump, ...
    void generate_loop(unsigned int start_id, unsigned int jump, 
                       RG rand_gen);

    // --- Storage -----------------------------------------------------------
    std::vector<Buffer> m_buffer;
    // One flag per buffer, shared between threads (guarded by m_mtx):
    // true  = buffer is handed to its worker thread for (re)filling,
    // false = buffer is filled and available to the reader
    std::vector<bool> m_buffer_lock;

    // --- Reader state ------------------------------------------------------
    unsigned int m_read_buffer; // Index of the buffer being read
    unsigned int m_read_iter;   // Index of the next value in that buffer

    // --- Threads and synchronisation ---------------------------------------
    std::mutex m_mtx;                  // Guards m_buffer_lock
    std::atomic<bool> m_stop_signal;   // Stop request shared by all threads
    std::vector<std::thread> m_threads;
    std::vector<std::condition_variable> m_cv_write; // One per worker thread
    std::condition_variable m_cv_read; // Signalled when a buffer is filled
};


/* Constructor: allocates the buffers, validates the parameters and launches
 * nthreads worker threads running generate_loop.
 *
 * buffer_size: number of values per buffer (must be >= 1)
 * nbuffers:    number of buffers (must be >= 1 and >= nthreads)
 * nthreads:    number of worker threads (must be >= 1)
 * rand_gen:    engine copied once per thread; each copy is reseeded with a
 *              value drawn from another copy of rand_gen, so the seeds
 *              depend only on the state of rand_gen and on nthreads.
 *
 * Throws std::length_error on an invalid parameter (before any thread is
 * started). If starting a thread fails, the threads already running are
 * stopped and joined and the exception is rethrown.
 */
template <typename RG, typename Return_Type> 
ThreadedGen<RG,Return_Type>::ThreadedGen(unsigned int buffer_size, 
                                  unsigned int nbuffers, 
                                  unsigned int nthreads, 
                                  RG rand_gen)
    : 
    m_buffer(std::vector<std::vector<Return_Type>>
                (nbuffers, std::vector<Return_Type>(buffer_size))),
    m_buffer_lock(std::vector<bool>(nbuffers, true)), // To be filled (see below)
    m_read_buffer(nbuffers-1), // Reading starts at the last buffer...
    m_read_iter(buffer_size),  // ...one past its end, so the first
                               // get_value() moves on to buffer 0
    m_stop_signal(false),
    m_cv_write(nthreads)
{
    // Range errors checking
    if(nbuffers < nthreads) 
        throw std::length_error("Number of threads cannot exceed number of " 
                                "buffers");
    if(nbuffers < 1)
        throw std::length_error("Parameter nbuffers must be above 0.");
    if(buffer_size < 1)
        throw std::length_error("Parameter buffer_size must be above 0.");
    if(nthreads < 1)
        throw std::length_error("Parameter nthreads must be above 0.");

    // The first get_value() call hands the last buffer back to its thread
    // (as if it had just been read). If that buffer started as "to be 
    // filled", its thread could fill it either before or after that call,
    // and in the first case the fill would be thrown away and redone: the
    // output sequence would depend on timing. Starting it as "not to be
    // filled" makes its thread wait for that first hand-back, so every fill
    // is read exactly once. No lock needed: no thread is running yet.
    m_buffer_lock[nbuffers - 1] = false;

    // Copy of the initial generator, used to seed others
    RG seeding_rg(rand_gen);
    // Copy of the initial generator, reseeded before each copy
    RG copy_rg(rand_gen);
    try
    {
        // Reserve first so that growing the vector cannot fail mid-launch
        m_threads.reserve(nthreads);
        // Launches each thread (std::thread stores its own copy of copy_rg)
        for(unsigned int i = 0; i < nthreads; ++i)
        {
            const unsigned int seed = seeding_rg.genRand();
            copy_rg.reSeed(seed);
            m_threads.emplace_back(&ThreadedGen::generate_loop, this, i, 
                                   nthreads, copy_rg);
        }
    }
    catch(...)
    {
        // Destroying a joinable std::thread calls std::terminate, and the
        // destructor of this object will not run: stop and join the threads
        // that did start before propagating the error.
        m_stop_signal.store(true);
        {
            // Notify under the mutex so that no thread can miss the signal
            // between checking the stop flag and going to sleep
            std::lock_guard<std::mutex> guard(m_mtx);
            for(auto& cv: m_cv_write)
                cv.notify_all();
        }
        for(auto& t: m_threads) t.join();
        throw;
    }
}


/* Destructor: raises m_stop_signal, wakes every worker thread so that it
 * can see the signal and leave generate_loop, then joins all the threads.
 */ 
template <typename RG, typename Return_Type>
ThreadedGen<RG,Return_Type>::~ThreadedGen()
{
    // Ask the worker loops to stop
    m_stop_signal.store(true);

    {
        // The notifications are sent while holding the mutex: a worker that
        // has tested the stop flag but is not yet waiting still holds the 
        // mutex, so it cannot miss the wake-up.
        std::lock_guard<std::mutex> guard(m_mtx);
        for(std::condition_variable& writer_cv : m_cv_write)
            writer_cv.notify_all();
    } // Mutex released here: the workers need it to wake up and exit

    for(std::thread& worker : m_threads)
        worker.join();
}   


/* Worker thread function.
 * Endless loop that, at each iteration, (re)generates one buffer and then 
 * marks it as available to the reader.
 * The thread only ever writes to its own subset of the buffers: start_id,
 * start_id + jump, start_id + 2*jump, ... wrapping back to start_id.
 * When the next buffer of the subset has not been handed back yet, the 
 * thread sleeps on its condition variable until it is.
 * The stop signal ends the loop and lets the function return.
 *
 * start_id: index of the first buffer owned by this thread; also the index
 *           of this thread's condition variable in m_cv_write
 * jump:     distance between two consecutive buffers of this thread
 * rand_gen: this thread's private engine
 */
template <typename RG, typename Return_Type> 
void ThreadedGen<RG,Return_Type>::generate_loop(unsigned int start_id, unsigned jump, 
                                RG rand_gen)
{
    // Condition variable this thread sleeps on. m_cv_write is sized once in
    // the constructor; taking the mutex here is purely defensive.
    std::condition_variable* wake_cv = nullptr;
    {
        std::lock_guard<std::mutex> guard(m_mtx);
        wake_cv = &m_cv_write[start_id];
    }

    unsigned int target = start_id; // Buffer to fill next
    for(;;)
    {   
        {
            // Sleep until asked to stop or until the target buffer has been
            // handed back for (re)filling
            std::unique_lock<std::mutex> guard(m_mtx);
            wake_cv->wait(guard, [this, &target] { 
                return m_stop_signal.load() || m_buffer_lock[target]; });
        }

        // Leave right after waking up, before doing any more work
        if(m_stop_signal.load()) return;

        // (Re)fill the buffer; no mutex needed, the reader does not touch
        // a buffer that is flagged for filling
        std::vector<Return_Type>& slot = m_buffer[target];
        for(auto it = slot.begin(); it != slot.end(); ++it)
            *it = rand_gen.genDist();

        {
            // Publish the buffer and wake the reader, both under the mutex
            // (shared flag, and no lost wake-up)
            std::lock_guard<std::mutex> guard(m_mtx);
            m_buffer_lock[target] = false;
            m_cv_read.notify_all();
        }

        // Move on to the next buffer of this thread, wrapping around
        target += jump;
        if(target >= m_buffer.size()) target = start_id;
    }
}   


/* Returns the next random value. The buffers are read in index order, which
 * is meant to give a reproducible sequence for a given number of threads.
 * When the end of the current buffer is reached, that buffer is handed back
 * to its worker thread for refilling and reading moves on to the next 
 * buffer, blocking until that one has been filled.
 * The read position (m_read_buffer, m_read_iter) is updated outside the 
 * mutex - presumably a single reader thread is intended; TODO confirm.
 */
template <typename RG, typename Return_Type> 
Return_Type ThreadedGen<RG,Return_Type>::get_value()
{
    // Fast path: values are left in the current buffer
    if(m_read_iter < m_buffer[m_read_buffer].size())
        return m_buffer[m_read_buffer][m_read_iter++];

    // Current buffer is exhausted.
    // Worker thread in charge of it, and the condition variable it sleeps on
    const unsigned int owner = m_read_buffer % m_threads.size(); 
    std::condition_variable& owner_cv = m_cv_write[owner];
    {
        // Hand the buffer back and wake its owner, both under the mutex
        // (shared flag, and no lost wake-up)
        std::lock_guard<std::mutex> guard(m_mtx);
        m_buffer_lock[m_read_buffer] = true;
        owner_cv.notify_all();
    }

    // Move to the start of the next buffer (wrapping around)
    m_read_buffer = (m_read_buffer + 1) % m_buffer.size();
    m_read_iter = 0;

    {
        // Block until the next buffer has been filled by its worker
        std::unique_lock<std::mutex> guard(m_mtx);
        m_cv_read.wait(guard, 
                       [this] { return !m_buffer_lock[m_read_buffer]; });
    }

    // Return the first value of the new buffer and advance
    return m_buffer[m_read_buffer][m_read_iter++];
}




#endif //THREADEDGEN_H
