package dataprocessing;

import java.io.*;
import java.util.*;

import ucs.UCSconfig;

/**
 * General purpose datafile interface.  Loads a delimited text file into a list of
 * <b>Example</b> objects and provides the data manipulations built on top of that list:
 * shuffling, train/test splitting, splitting into folds, printing and class-noise injection.
 */
public class DataSource
{
	/** Characters that separate the values on one line of the data file (comma, space, tab). */
	private static final String DELIMITERS = ", \t";

	/** Data structure holding all the examples */
	public ArrayList exampleList = null;

	/** Number of input attributes per example (tokens on the first data line minus the target). */
	public int numInputs    = -1;

	/** Number of leading examples used as training data; -1 until a split has been chosen. */
	private int trainingSize = -1;
	private int testSize;

	public String filename="";

//////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Constructor for a DataSource object. Opens the specified file and reads in all data, which
	 * must have one example per line with values separated by commas, spaces or tabs.  The last
	 * value on each line is treated as the target variable.  Lines that are blank or contain no
	 * values are skipped.
	 * <p>
	 * Every value is treated as a symbol: the distinct symbols seen for each column are collected
	 * and sorted, and each Example stores the index of its symbol in that sorted list.  The sorted
	 * lists are stored in the static fields <code>Example.inputAlphabet</code> and
	 * <code>Example.targetAlphabet</code>, so constructing another DataSource replaces them.
	 * <p>
	 * If the file cannot be found or read, contains no examples, or contains a line with fewer
	 * values than the first data line, a message is printed and the JVM exits with status 1.
	 * Values beyond the target column on a line are ignored.
	 * @param filename A string specifying a file from which to load some data.
	 */
	public DataSource ( String filename )
	{
		this.filename=filename;

		//Initialise the data structure to hold our training or testing examples
		//
		exampleList = new ArrayList();

		//Read the file completely (as token arrays) so we can find out what the
		// legal value list is for each attribute before encoding any example
		//
		ArrayList rows = null;
		try
		{
			rows = readRows( filename );
		}
		catch( FileNotFoundException ex )
		{
			fail( "Datafile '"+filename+"' not found." );
			return;
		}
		catch( IOException ioe )
		{
			fail( "IO Exception when reading datafile '"+filename+"'." );
			return;
		}

		if (rows.isEmpty())
		{
			fail( "Datafile '"+filename+"' contains no examples." );
			return;
		}

		//The first data line fixes the number of input attributes
		//
		numInputs = ((String[])rows.get(0)).length - 1;

		////////////////////////////////////////////////////////

		//First pass: collect the distinct symbols of every column.
		// TreeSets keep them in sorted order, which defines the symbol indices.
		//
		TreeSet[] inputSymbols = new TreeSet[numInputs];
		for (int i=0; i<numInputs; i++) inputSymbols[i] = new TreeSet();
		TreeSet targetSymbols = new TreeSet();

		for (int r=0; r<rows.size(); r++)
		{
			String[] tokens = (String[])rows.get(r);

			if (tokens.length <= numInputs)
			{
				fail( "Datafile '"+filename+"': example "+(r+1)+" has only "+tokens.length
					+" value(s); expected at least "+(numInputs+1)+"." );
				return;
			}

			for (int i=0; i<numInputs; i++) inputSymbols[i].add( tokens[i] );

			//DON'T FORGET THE TARGET VARIABLE
			targetSymbols.add( tokens[numInputs] );
		}

		Example.inputAlphabet = new ArrayList[numInputs];
		for (int i=0; i<numInputs; i++)
		{
			Example.inputAlphabet[i] = new ArrayList( inputSymbols[i] );
		}
		Example.targetAlphabet = new ArrayList( targetSymbols );

		////////////////////////////////////////////////////////

		//Second pass: encode every line as an Example holding symbol indices
		//
		for (int r=0; r<rows.size(); r++)
		{
			String[] tokens = (String[])rows.get(r);

			Example thisExample = new Example( numInputs );

			for (int attrib=0; attrib<numInputs; attrib++)
			{
				thisExample.inputs[attrib] = symbolIndex( Example.inputAlphabet[attrib], tokens[attrib] );
			}

			//Now the target value
			//
			thisExample.target = symbolIndex( Example.targetAlphabet, tokens[numInputs] );

			//Add this Example to our list of examples
			//
			exampleList.add( thisExample );
		}

		//Hint that the temporary parsing structures can be reclaimed
		System.gc();
	}

	/**
	 * Reads every non-empty line of the file and splits it into its values.
	 * The reader is always closed, even when reading fails.
	 * @param filename The file to read.
	 * @return An ArrayList of String[] token arrays, one per data line, in file order.
	 * @throws IOException If the file cannot be opened or read.
	 */
	private static ArrayList readRows( String filename ) throws IOException
	{
		ArrayList rows = new ArrayList();
		BufferedReader reader = new BufferedReader( new FileReader( new File( filename ) ) );
		try
		{
			String line;
			// readLine() returns null at end of stream
			while ((line = reader.readLine()) != null)
			{
				if (line.trim().length()==0) continue;

				StringTokenizer tokenizer = new StringTokenizer( line, DELIMITERS );
				int count = tokenizer.countTokens();

				// a line made up only of delimiters carries no data
				if (count==0) continue;

				String[] tokens = new String[count];
				for (int i=0; i<count; i++) tokens[i] = tokenizer.nextToken();
				rows.add( tokens );
			}
		}
		finally
		{
			reader.close();
		}
		return rows;
	}

	/**
	 * Looks up the index of a symbol in a sorted alphabet; exits if the symbol is unknown.
	 * @param alphabet The list of legal symbols.
	 * @param symbol The symbol to look up.
	 * @return The position of the symbol in the alphabet.
	 */
	private static int symbolIndex( List alphabet, String symbol )
	{
		int pos = alphabet.indexOf( symbol );
		if (pos==-1)
		{
			System.out.println("SYMBOL NOT RECOGNISED!");
			System.exit(1);
		}
		return pos;
	}

	/**
	 * Reports a fatal data-file problem on standard error and exits with status 1.
	 * @param message The message to print.
	 */
	private static void fail( String message )
	{
		System.err.println( message );
		System.exit(1);
	}

	/**
	 * Chooses the default split (first half of the data is training data) if no
	 * training size has been set yet.
	 */
	private void ensureTrainingSize()
	{
		if (trainingSize == -1) trainingSize = exampleList.size()/2;
	}

//////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Returns the total number of Examples available in this DataSource.
	 * @return The number of Examples.
	 */
	public int numExamples()
	{
		return exampleList.size();
	}

//////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Returns an ArrayList of <b>Example</b> objects, containing all data in this source.
	 * This is the internal list itself, not a copy.
	 * @return The list of all Examples.
	 */
	public ArrayList getData()
	{
		return exampleList;
	}

//////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Randomly shuffles this DataSource.  After a call to this method, all data in this source will be in random order.
	 * The shuffled data is stored in a new list; a list previously obtained from
	 * {@link #getData()} is left untouched.
	 */
	public void shuffle()
	{
		ArrayList shuffled = new ArrayList( exampleList );
		Collections.shuffle( shuffled, new Random() );
		exampleList = shuffled;
	}

//////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Returns an ArrayList of Example objects to use as training data.
	 * Unless a training size has already been set (for example by
	 * {@link #initialFolding(int, UCSconfig)}), the first 50% of the data is used.
	 * @return A new ArrayList holding the first <i>trainingSize</i> Examples.
	 */
	public ArrayList getTrainingData()
	{
		ensureTrainingSize();
		return new ArrayList( exampleList.subList( 0, trainingSize ) );
	}


//////////////////////////////////////////////////////////////////////////////////////

	// Disabled earlier variant, kept for reference: returns the training data
	// obtained by splitting the file into train and test folds.
	//
	// public ArrayList getTrainingFolds(int folds)
	// {
	//     // put n-1 folds in the training set
	//     testSize = exampleList.size()/folds;
	//     trainingSize = exampleList.size() - testSize;
	//
	//     System.out.println("examples = " + exampleList.size() + " trainingSize = " + trainingSize + " testSize = " + testSize);
	//
	//     ArrayList training = new ArrayList();
	//
	//     for (int i=0; i<trainingSize; i++)
	//         training.add( exampleList.get(i) );
	//
	//     return training;
	// }


//////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Splits the Example objects of this source into the specified number of equally sized
	 * folds and stores them, as an ArrayList of ArrayLists, in <code>params.allFolds</code>.
	 * If the number of examples doesn't divide by folds evenly, the remaining examples are ignored.
	 * Also records the training size (folds-1 folds) and test size (one fold).
	 * @param folds The number of folds; must be at least 1.
	 * @param params The configuration object that receives the folds and supplies the verbosity level.
	 * @throws IllegalArgumentException If folds is less than 1.
	 */
	public void initialFolding(int folds, UCSconfig params)
	{
		if (folds < 1)
			throw new IllegalArgumentException( "folds must be at least 1, but was " + folds );

		if (params.verbosity >= 2)
			System.out.println("Peforming initial folding into " + folds + " folds");

		// put n-1 folds in the training set and 1 fold in the test set
		int foldSize = exampleList.size()/folds;
		testSize = foldSize;
		trainingSize = foldSize * (folds -1);

		if (params.verbosity >= 2)
			System.out.println("examples = " + exampleList.size() + " trainingSize = " + trainingSize + " testSize = " + testSize);

		params.allFolds = new ArrayList();

		for (int fold=0; fold < folds; fold++)
		{
			int start = fold * foldSize;
			params.allFolds.add( new ArrayList( exampleList.subList( start, start + foldSize ) ) );
		}
	}

//////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Returns an ArrayList of Example objects to use as testing data: every example
	 * that is not part of the training data returned by {@link #getTrainingData()}.
	 * If no training size has been set yet, the default 50% split is used.
	 * @return A new ArrayList holding the Examples from index <i>trainingSize</i> to the end.
	 */
	public ArrayList getTestingData()
	{
		ensureTrainingSize();
		testSize = exampleList.size() - trainingSize;

		// The test examples start where the training examples end
		return new ArrayList( exampleList.subList( trainingSize, exampleList.size() ) );
	}

//////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Prints out every Example in this DataSource.
	 */
	public void printData()
	{
		for (int whichExample=0; whichExample<numExamples(); whichExample++)
		{
			Example thisExample = (Example)exampleList.get(whichExample);
			thisExample.print();
		}
	}


//////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Adds noise to the target of the supplied ArrayList with probability prob.
	 * A corrupted target of 1 becomes 0 and any other target becomes 1, so this
	 * is only a true class flip when the targets are 0 and 1.
	 * @param prob The probability of class noise being added to each Example.
	 * @param data An ArrayList of Examples to apply the noise to; the Examples are modified in place.
	 */
	public void addTargetNoise(double prob, ArrayList data)
	{
		for (int whichExample=0; whichExample<data.size(); whichExample++)
		{
			Example ex = (Example)data.get(whichExample);

			if(Math.random() < prob)
			{
				//corrupt the data
				//
				if(ex.target==1) ex.target=0; else ex.target=1;
			}
		}
	}


//////////////////////////////////////////////////////////////////////////////////////

}

