#! /usr/bin/env python

#
# Copyright (C) 2019 Data Engines Corporation.
#
# Author: Dr. Andres Corrada-Emmanuel
#

#
# This code produces a CSV file simulating 9 regressors
# producing n samples
#
# Input: Number of samples desired and output filename.
# Output: A CSV file with n samples for the 9 synthetic regressors
#
# Sanity Check: Your output file should be n lines long.

import csv
import random

def synthetic_dataset(n, filename):
    """Write ``n`` synthetic rows to ``filename`` in CSV format.

    Each row is produced by one call to ``synthetic_row()``. An existing
    file at ``filename`` is overwritten.

    Args:
        n: Number of rows (samples) to write. ``range(n)`` is used, so a
            value of zero or less yields an empty file.
        filename: Path of the CSV file to create.

    Returns:
        None.
    """
    # newline='' hands line-ending control to the csv module. Without it,
    # platforms that translate '\n' on write turn the writer's '\r\n' into
    # '\r\r\n', which shows up as a blank line after every row and breaks
    # the "output should be n lines long" sanity check.
    with open(filename, 'w', newline='') as fp:
        writer = csv.writer(fp)
        # Rows are generated lazily, one synthetic_row() call per sample.
        writer.writerows(synthetic_row() for _ in range(n))


# Here is where we synthesize the row.
# Get ready to get your head blown a little
# bit. We know this from experience.
# Our algorithms do not measure accuracy, they
# measure precision. These two words are commonly
# used and have different meanings both colloquially
# and technically. We have our own definitions.
# This code is not the place to explain them. But
# the consequence of it is that, if you prepare your
# synthetic dataset correctly, you can just synthesize
# the error signal. Always be aware that an error signal
# acquires a different meaning depending on the signal
# on top of which it is placed.

# The synthetic dataset used here is very simple and
# illustrates how the synthetic error dataset needs
# to be interpreted. We are going to do uniform noise
# of constant width but different biases. This would not
# be a good description of absolute error, maybe okay
# for relative error.

# We use it because it is simple to code and describe.
# It also allows you to do some simple experiments to
# test the functionality of the online app. We'll
# point those out below.

# One possible experiment is to hide a couple of
# badly biased regressors among the set of nine
# Per-regressor bias: two regressors are offset by 1, the other seven are
# unbiased.
ensemble_bias = [1, 1, 0, 0, 0, 0, 0, 0, 0]

# random.shuffle() reorders the list in place and returns None, so its
# result must not be assigned. The in-place shuffle is kept because
# synthetic_row() reads ensemble_bias directly; this is what hides the
# biased regressors at random column positions.
random.shuffle(ensemble_bias)

# Snapshot of the shuffled bias assignment actually used for this run.
experiment_bias = list(ensemble_bias)

def synthetic_row():
    """Return one synthetic error sample per entry in ``ensemble_bias``.

    Each value is drawn with ``random.uniform`` from an interval of width
    2 centred on that entry's bias, i.e. between ``-1 + bias`` and
    ``1 + bias``.

    Returns:
        A list of floats, one per entry of ``ensemble_bias``, in order.
    """
    # The uniform distribution could be swapped for another one here. The
    # downstream technology is described as non-parametric: it does not
    # estimate distribution parameters or try to identify which
    # distribution generated the errors; it only measures sample error
    # using the ensemble of regressors.
    row = []
    for offset in ensemble_bias:
        lower = -1 + offset
        upper = 1 + offset
        row.append(random.uniform(lower, upper))
    return row


if __name__ == "__main__":

    import sys

    # Two positional arguments are required: the sample count and the
    # output path. Exit with a usage message instead of an IndexError
    # traceback when they are missing. Extra arguments are ignored, as
    # before.
    if len(sys.argv) < 3:
        sys.exit("usage: {} <number_of_samples> <output_filename>".format(sys.argv[0]))

    # Report a non-integer sample count plainly rather than with a
    # ValueError traceback.
    try:
        n = int(sys.argv[1])
    except ValueError:
        sys.exit("error: number of samples must be an integer, got {!r}".format(sys.argv[1]))

    filename = sys.argv[2]
    synthetic_dataset(n, filename)