#! /usr/bin/env python
#
# Copyright (C) 2019 Data Engines Corporation.
#
# Author: Dr. Andres Corrada-Emmanuel
#
#
# This code produces a CSV file simulating 9 regressors
# producing n samples
#
# Input: Number of samples desired and output filename.
# Output: A CSV file with n samples for the 9 synthetic regressors
#
# Sanity Check: Your output file should be n lines long.

import csv
import random


def synthetic_dataset(n, filename):
    """Write n synthetic rows to a CSV file.

    Args:
        n: Number of rows (samples) to write. A value of zero or less
            produces an empty file.
        filename: Path of the CSV file to create; an existing file is
            overwritten.
    """
    # We open a file and point a CSV writer at it.
    # newline='' is what the csv module asks for: the writer emits its own
    # line terminators, and letting the text layer translate them as well
    # produces a spurious blank line after every row on some platforms.
    with open(filename, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerows(synthetic_row() for _ in range(n))


# Here is where we synthesize the row.

# Get ready to get your head blown a little
# bit. We know this from experience.
# Our algorithms do not measure accuracy, they
# measure precision. These two words are commonly
# used and have different meanings both colloquially
# and technically. We have our own definitions.
# This code is not the place to explain them. But
# the consequence of it is that, if you prepare your
# synthetic dataset correctly, you can just synthesize
# the error signal. Always be aware that an error signal
# acquires a different meaning depending on the signal
# on top of which it is placed.

# The synthetic dataset used here is very simple and
# illustrates how the synthetic error dataset needs
# to be interpreted. We are going to do uniform noise
# of constant width but different biases. This would not
# be a good description of absolute error, maybe okay
# for relative error.

# We use it because it is simple to code and describe.
# It also allows you to do some simple experiments to
# test the functionality of the online app. We'll
# point those out below.

# One possible experiment is to hide a couple of
# badly biased regressors among the set of nine.
ensemble_bias = [1, 1, 0, 0, 0, 0, 0, 0, 0]

# random.shuffle() reorders the list in place and returns None, so the
# shuffle and the assignment have to be separate steps. ensemble_bias is
# still shuffled in place (synthetic_row reads it), and experiment_bias
# keeps a copy of the order that was drawn for this run.
random.shuffle(ensemble_bias)
experiment_bias = list(ensemble_bias)


def synthetic_row():
    """Return one row of synthetic errors, one value per regressor.

    Each value is drawn uniformly from [bias - 1, bias + 1], where bias is
    the corresponding entry of the module-level ensemble_bias list.

    Returns:
        A list of floats with the same length as ensemble_bias.
    """
    # Here you could change the error distribution to other ones,
    # but note the following. Data Engines technology is non-parametric,
    # we have no idea what distribution produced your error sample. So
    # we do not estimate any parameters or detect what distribution best
    # explains it. That is not us. We just measure the sample error using
    # your ensemble of regressors. We bring Wisdom of the Crowd to
    # inference.
    return [random.uniform(-1 + bias, 1 + bias) for bias in ensemble_bias]


if __name__ == "__main__":
    import sys

    # Fail with a readable usage line instead of an IndexError traceback
    # when arguments are missing. Extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit("usage: {} <n_samples> <output_filename>".format(sys.argv[0]))

    n = int(sys.argv[1])
    filename = sys.argv[2]

    synthetic_dataset(n, filename)