#include "clErrorClass.h"
#include "speedTestClass.h"
#include "OpenCLWrapperClass.h"
#include <iostream>
#include <fstream>
#include <vector>
#include "calc.h"


// Number of integration steps requested for the pi estimate (2^30).
// Parenthesized so the product stays a single operand wherever the macro is
// expanded (e.g. `x / INSTEPS` must divide by the whole product).
#define INSTEPS (1024*1024*1024)
// Per-work-item factor used in main()'s step arithmetic
// (nsteps = work_group_size * ITERS * nwork_groups) and passed to the kernel
// as its first argument. Alternative values are kept commented out below.
#define ITERS 256
//#define ITERS 8192
//#define ITERS 65536
//#define ITERS 262144

// NOTE(review): N and NN are not referenced by the code visible next to these
// definitions; presumably leftovers - confirm before removing.
#define N  100
#define NN 500


speedTestClass spd;     // Measures the execution speed of each processing stage (used via spd.st() / spd.showTimeAll() in main)

// File name handed to createProgramObject() / createKernel() in main().
#define FILE_NAME "calc.cl"
// Name of the kernel that main() configures, runs and reads results from.
#define KERNEL_NAME "pi_vec8"


/**
 * Benchmark driver for the pi estimate.
 *
 * Runs the OpenCL kernel KERNEL_NAME on up to two devices (device 0, then
 * device 1 when it exists), sums the per-work-group partial results on the
 * host, and then computes the same value with getPI() and getPI_OMP().
 * Every result is printed to stdout and the timings collected by the global
 * `spd` are shown at the end.
 *
 * @return 0 always.
 */
int main(void)
{
    const int     in_nsteps = INSTEPS;       // requested number of steps
    int           niters    = ITERS;         // non-const: passed by address as a kernel argument
    int           nsteps    = in_nsteps;     // steps actually used; stays defined even if no device runs
    float         step_size;
    float         pi;


    // Initialize OpenCL: instantiate the class that bundles the objects used with OpenCL.
    OpenCLWrapperClass *ocl = new OpenCLWrapperClass();

#if 1
    // Dump platform/device information for every usable device.
    for(unsigned int i=0; i< ocl->getUsableOpenCLDeviceNum(); ++i){
        ocl->setUseDevice(i);
        ocl->showPlatformInfoAll();
        ocl->showDeviceInfoAll();
        DBG(\n);
    }
#endif
    // Select the device to use.
    ocl->setUseDevice(0);

    // Create the program object.
    ocl->createProgramObject(FILE_NAME);

    // Create the kernels.
    ocl->createKernel(FILE_NAME, "addVector");
    ocl->createKernel(FILE_NAME, "pi");
    ocl->createKernel(FILE_NAME, "pi_vec4");
    ocl->createKernel(FILE_NAME, "pi_vec8");

    // Run on at most two devices, but never select a device index that does not exist.
    // NOTE(review): the timer labels below assume device 0 is the GPU and
    // device 1 the CPU - confirm against the platform's device ordering.
    for (unsigned int loop = 0; loop < 2 && loop < ocl->getUsableOpenCLDeviceNum(); ++loop) {
        ocl->setUseDevice(loop);

        // Work out the work-item counts. The work-group size is reset for
        // every device so a larger size chosen for the previous device is
        // never carried over to this one.
        size_t       work_group_size = 32;
        const size_t max_size = *(size_t *)ocl->getDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE);
        if (max_size > work_group_size) work_group_size = max_size;

        const size_t nwork_groups = in_nsteps / (work_group_size * niters);
        if (nwork_groups == 0) {
            // The divisions below would divide by zero; skip this device.
            std::cerr << "device " << loop << ": work-group size " << work_group_size
                      << " is too large for " << in_nsteps << " steps, skipping" << std::endl;
            continue;
        }
        work_group_size = in_nsteps / (nwork_groups * niters);
        nsteps          = static_cast<int>(work_group_size * niters * nwork_groups);
        step_size       = 1.0f / (float)nsteps;

        // Host buffer for the per-work-group partial sums; released
        // automatically at the end of each iteration.
        std::vector<float> psum_data(nwork_groups);

        // Create the memory object.
        ocl->createMemory(CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups , NULL  , "partial_sums");

        // Set the kernel arguments.
        ocl->setKernelArgs(KERNEL_NAME,                                         // kernel name
            &niters                       , sizeof(int)    ,                    //  1
            &step_size                    , sizeof(float)  ,                    //  2
            (void *)NULL                  , sizeof(float)  *work_group_size,    //  3
            ocl->getMemory("partial_sums"), sizeof(cl_mem) );                   //  4


        // Compute pi with the OpenCL kernel.
        if (loop) spd.st("run Kernel Function getPI on cpu");   // start
        else      spd.st("run Kernel Function getPI on gpu");   // start
        size_t globalSize[] = {nwork_groups * work_group_size}; // global domain size

        // Run the kernel.
        ocl->runKernel(KERNEL_NAME,                     // kernel to run
                       1,                               // number of dimensions
                       globalSize,                      // global work-space size
                       &work_group_size);               // work-group size
        // Fetch the result.
        ocl->getResult(KERNEL_NAME,                     // kernel that was run
                       "partial_sums",                  // memory object holding the result
                       sizeof(float) * nwork_groups,    // size of the result
                       &psum_data[0]);                  // destination array

        // Reduce the partial sums on the host.
        pi = 0;
        for (size_t i = 0; i < nwork_groups; ++i) {
            pi += psum_data[i];
        }
        pi = pi * step_size;

        // Stop under the same label that was used to start, matching how the
        // CPU sections below pair their start/stop calls.
        // NOTE(review): assumes spd.st() pairs calls by label - confirm.
        if (loop) spd.st("run Kernel Function getPI on cpu");   // stop
        else      spd.st("run Kernel Function getPI on gpu");   // stop
        std::cout << pi << std::endl;
    }

    // Compute pi on the CPU.
    spd.st("run CPU Function getPI");// start
    pi = getPI(nsteps);
    spd.st("run CPU Function getPI");// stop
    std::cout << pi << std::endl;


    // Compute pi on the CPU with OpenMP.
    spd.st("run CPU Function getPI OMP");// start
    pi = getPI_OMP(nsteps);
    spd.st("run CPU Function getPI OMP");// stop
    std::cout << pi << std::endl;


    // Show the timings.
    spd.showTimeAll();

    // Release resources.
    delete ocl;

DBG(press enter);
NL(1);
    return 0;
}
