//------------------------------------------------------------------------------
//
//  PROGRAM: Matrix Multiplication driver
//
//  PURPOSE: This is a driver program to test various ways of computing
//           the product:
//
//                C  = A * B
//
//           A and B are set to constant matrices so we
//           can make a quick test of the multiplication.
//
//  USAGE:   The matrices are constant matrices, square and the order is
//           set as a constant, ORDER (see mult.h).
//
//  HISTORY: Written by Tim Mattson, August 2010 
//           Modified by Simon McIntosh-Smith, May 2011
//
//------------------------------------------------------------------------------

#include "mult.h"
#include "kernels.h"

//------------------------------------------------------------------------------
int main(int argc, char **argv)
{
    float            *A;                // A matrix 
    float            *B;                // B matrix
    float            *C;                // C matrix (C = A*B)
    int              Mdim, Ndim, Pdim;  // A[N][P], B[P][M], C[N][M] 
    int              i,j,k;             // loop indices
    int              err;               // error code returned from OpenCL calls
    int              szA, szB, szC;     // number of elements in each matrix
    size_t           global[DIM];       // global domain size  
    size_t           local[DIM];        // local  domain size  
    cl_device_id     device_id;         // compute device id 
    cl_context       context;           // compute context
    cl_command_queue commands;          // compute command queue
    cl_program       program;           // compute program
    cl_kernel        kernel;            // compute kernel
    cl_uint          nd;                // Number of dimensions in NDRange
    cl_mem           a_in;              // device memory used for the input  a vector
    cl_mem           b_in;              // device memory used for the input  b vector
    cl_mem           c_out;             // device memory used for the output c vector
    double           start_time;        // Starting time
    double           run_time;      // timing data

    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;

    szA = Ndim*Pdim;    szB = Pdim*Mdim;     szC = Ndim*Mdim;
    A   = (float *)malloc(szA*sizeof(float));
    B   = (float *)malloc(szB*sizeof(float));
    C   = (float *)malloc(szC*sizeof(float));
    initmat(Mdim, Ndim, Pdim, A, B, C);
 
//--------------------------------------------------------------------------------   
// Do the matrix product sequentially on the CPU.
//--------------------------------------------------------------------------------   
#ifndef NO_SEQUENTIAL
    printf("\n===== Sequential, matrix mult (dot prod), order %dx%d on host CPU ======\n",ORDER,ORDER);
    for(i=0; i<COUNT; i++){
       zero_mat(Ndim, Mdim, C);
       start_time = wtime(); 

            seq_mat_mul_sdot(Mdim, Ndim, Pdim, A, B, C);        

       run_time  = wtime() - start_time;
       results(Mdim, Ndim, Pdim, C, run_time);
    }
#endif

//--------------------------------------------------------------------------------   
// Setup the OpenCL platform 
//--------------------------------------------------------------------------------   

    // Get platform IDs
    cl_uint nPlatforms;
    err  = clGetPlatformIDs( 0, NULL, &nPlatforms);
    cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * nPlatforms);
    err  = clGetPlatformIDs( 1, platforms, &nPlatforms);
    printf("\nNumber of platform IDs = %d\n",nPlatforms);

    // Connect to a GPU (gpu=1) or a CPU (gpu=0)
    int gpu = 1; 
    err = clGetDeviceIDs(platforms[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
        free(platforms); platforms = NULL;
    if (err != CL_SUCCESS)              
    {
        if (err ==  CL_INVALID_PLATFORM ) printf("Error: platform is not a valid platform.\n");
        if (err == CL_INVALID_DEVICE_TYPE) printf("Error: device_type is not a valid value.\n");
        if (err == CL_INVALID_VALUE) printf("Error: num_entries is equal to zero and device_type is not NULL or if both num_devices and device_type are NULL.\n");
        if (err == CL_DEVICE_NOT_FOUND) printf("Error: CL_DEVICE_NOT_FOUND\n");
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }

    err = output_device_info(device_id);

    // Create a compute context 
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

//--------------------------------------------------------------------------------   
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------   

    a_in   = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * szA, NULL, NULL);
    b_in   = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * szB, NULL, NULL);
    c_out  = clCreateBuffer(context,  CL_MEM_WRITE_ONLY, sizeof(float) * szC, NULL, NULL);
    if (!a_in || !b_in || !c_out)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }    
    
    //  Reset A, B and C matrices (just to play it safe)
    initmat(Mdim, Ndim, Pdim, A, B, C);

    // Write the A and B matrices into compute device memory 
    err = clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float) * szA, A, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write a_data to source array!\n",err_code(err));
        exit(1);
    }
    err = clEnqueueWriteBuffer(commands, b_in, CL_TRUE, 0, sizeof(float) * szB, B, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write b_data to source array!\n",err_code(err));
        exit(1);
    }
	
//--------------------------------------------------------------------------------   
// OpenCL matrix multiplication ... Naive
//--------------------------------------------------------------------------------   
    // setup kernel,  print header

    err = setup_kern_c_elem( device_id,  context, Ndim, Pdim, Mdim, a_in,  b_in, c_out,
                             &program, &kernel, global, &nd);
    if(err != SUCCESS)
       printf("failed to build kernel");
    else
    {

       // Do the multiplication COUNT times
       for(i=0; i<COUNT; i++){
          zero_mat(Ndim, Mdim, C);
          start_time = wtime(); 

          // Execute the kernel over the entire range of C matrix elements ... computing
          // a dot product for each element of the product matrix.  The local work
          // group size is set to NULL ... so I'm telling the OpenCL runtime to 
          // figure out a locall work group size for me.
             err = clEnqueueNDRangeKernel(commands, kernel, nd, NULL, global, NULL, 
                               0, NULL, NULL);
             if (err != CL_SUCCESS)
             {
                printf("Error: Failed to execute kernel!\n",err_code(err));
                continue;
             }

             // Wait for the commands to complete before reading back results
             clFinish(commands);

          run_time  = wtime() - start_time;

          // Read back the results from the compute device
          err = clEnqueueReadBuffer( commands, c_out, CL_TRUE, 0, sizeof(float) * szC, C, 0, 
                                                                               NULL, NULL );  
          run_time  = wtime() - start_time;
          if (err != CL_SUCCESS)
              printf("Error: Failed to read output array! \n", err_code(err));
          else
             results(Mdim, Ndim, Pdim, C, run_time);

       }  //end for loop
      
       // Cleanup for next kernel
       clReleaseProgram(program);
       clReleaseKernel(kernel);
    
    }  // end of kernel execution
	
//--------------------------------------------------------------------------------   
// OpenCL matrix multiplication ... C row per work item
//--------------------------------------------------------------------------------   
   
 
//--------------------------------------------------------------------------------   
// OpenCL matrix multiplication ... C row per work item, A row in pivate memory
//--------------------------------------------------------------------------------   


//--------------------------------------------------------------------------------   
// OpenCL matrix multiplication ... C row per work item, A row pivate, b col local
//--------------------------------------------------------------------------------   


//--------------------------------------------------------------------------------   
// Cleanup platform and memory objects.
//--------------------------------------------------------------------------------   
    clReleaseMemObject(a_in);
    clReleaseMemObject(b_in);
    clReleaseMemObject(c_out);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return 0;
    printf("\n all done \n");
}
