"""Sweep KNN and PCA+KNN hyper-parameters and report the best-scoring ones.

Run as a script, this module:

1. scores plain KNN for every K in ``1..MAXIMUM_K`` and pickles the scores;
2. fits a PCA with ``P_MAX`` components on the training matrix, then scores
   KNN on the first P columns of the transformed data for every P in
   ``P_LIST`` (one pool task per P) and pickles the scores;
3. prints the scores of the hard-coded ``BEST_EXPERIMENTAL_*`` parameters
   next to the best parameters found in this run.
"""
import multiprocessing as mp
from multiprocessing.pool import AsyncResult
import pickle

import numpy as np
import matplotlib.pyplot as plt

from classifier.data_models import DataModelMatrices, get_vectorized_data_for, get_vectorized_train_data
from classifier.cv import cross_validations
from classifier.knn import KNNPredictor
from classifier.data_loader import *
from classifier.misc import config_for_multiple_cpus
from classifier.pca import PCA

# Number of worker processes handed to mp.Pool.
MAX_PROCESSES = config_for_multiple_cpus()

MAXIMUM_K = 100  # Neighbors

# Inclusive range of column counts (P) evaluated on the PCA-transformed data.
P_MIN = 50
P_MAX = 250
P_LIST = range(P_MIN, P_MAX + 1, 1)

# Second argument of get_vectorized_data_for (meaning defined by that helper).
Q = 1000

# Forwarded to PCA(...) as ``epsilon`` and ``iterations`` respectively.
EPSILON = 1e-7
ITERS = 1_000_000

# Hard-coded reference parameters whose scores are printed for comparison.
BEST_EXPERIMENTAL_KNN = 12
BEST_EXPERIMENTAL_PCA_KNN = [
    (184, 14), (180, 14), (199, 17), (182, 14), (179, 14), (196, 25),
    (181, 12), (197, 17), (182, 11), (178, 11), (167, 13), (189, 14),
    (188, 14), (185, 14), (183, 14), (177, 14), (165, 14), (200, 15),
    (199, 15), (193, 15), (192, 15), (198, 17), (187, 11), (180, 11),
    (179, 11), (189, 12), (186, 12), (182, 12), (168, 13), (200, 17),
]
# [(148, 55), (240, 28), (243, 28), (155, 51), (155, 52), (157, 52), (173, 52), (146, 54),
#  (145, 56), (143, 57), (168, 57), (145, 58), (131, 69), (147, 77)]


def _init_worker(x_all_pca, train_df, test_df, all_df):
    """Install the shared inputs as module globals inside a pool worker.

    ``accuracies_for_P`` reads ``X_all_pca``, ``df_train``, ``df_test`` and
    ``df_all`` from module scope, but those names are only assigned under the
    ``__main__`` guard.  Workers started with ``spawn`` or ``forkserver``
    re-import this module without running the guard, so the values have to be
    handed over explicitly or every task fails with ``NameError``.
    """
    global X_all_pca, df_train, df_test, df_all
    X_all_pca = x_all_pca
    df_train = train_df
    df_test = test_df
    df_all = all_df


def _accuracy_per_k(knn, progress_template, *prefix_args):
    """Return an array whose entry ``K - 1`` is ``knn.predict_for(K)``.

    K runs over ``1..MAXIMUM_K``.  Before each prediction a progress line is
    printed, built as ``progress_template.format(*prefix_args, K)``.
    """
    accuracy = np.zeros((MAXIMUM_K,), dtype=np.float64)
    for K in range(1, MAXIMUM_K + 1):
        print(progress_template.format(*prefix_args, K))
        accuracy[K - 1] = knn.predict_for(K)
    return accuracy


def accuracies_for_P(P):
    """Score KNN for every K using the first ``P`` columns of ``X_all_pca``.

    Reads the module globals ``X_all_pca``, ``df_train``, ``df_test`` and
    ``df_all`` (set by the ``__main__`` script, or by ``_init_worker`` inside
    pool workers).

    Returns:
        A ``(P, accuracy)`` tuple so results can be collected straight into a
        dict keyed by P; ``accuracy[K - 1]`` holds the score for K neighbours.
    """
    print("Generando KNN para P = {}".format(P))
    X_pca = X_all_pca[:, :P]
    data_models = DataModelMatrices(X_pca)
    knn = KNNPredictor(data_models, df_train, df_test, df_all)
    accuracy = _accuracy_per_k(knn, "Ejecutando Predictor con (P = {}) y (K = {})", P)
    return P, accuracy


def accuracies_for_KNN():
    """Score KNN without PCA for every K in ``1..MAXIMUM_K``.

    Reads the module globals ``X_all``, ``X_tn``, ``df_train``, ``df_test``
    and ``df_all``; the data model is built from ``X_all`` with
    ``center_with=X_tn``.

    Returns:
        Array of length ``MAXIMUM_K``; entry ``K - 1`` is the score for K.
    """
    print("Generando KNN (sin PCA)")
    data_models = DataModelMatrices(X_all, center_with=X_tn)
    knn = KNNPredictor(data_models, df_train, df_test, df_all)
    return _accuracy_per_k(knn, "Ejecutando Predictor (sin PCA) con (K = {})")


if __name__ == "__main__":
    # These names are deliberately module globals: the scoring functions
    # above read them from module scope.
    df_all = load_df()
    df_train = load_df_train()
    df_test = load_df_real_world()
    X_all = get_vectorized_data_for(df_all, Q)
    X_tn = get_vectorized_train_data(X_all, df_train)

    # --- Plain KNN sweep -------------------------------------------------
    accuracies_KNN = accuracies_for_KNN()
    with open("results_all_knn_{}_to_{}.pkl".format(P_MIN, P_MAX), 'wb') as file:
        pickle.dump(accuracies_KNN, file)
    # Top five (K, score) pairs, highest score first.
    best_KNN = sorted(
        zip(range(1, MAXIMUM_K + 1), map(float, accuracies_KNN)),
        key=lambda a: a[1],
        reverse=True,
    )[0:5]

    # --- PCA + KNN sweep -------------------------------------------------
    # Fit once with P_MAX components; every smaller P is a column prefix.
    print("Generando PCA para P = {}".format(P_MAX))
    pca = PCA(X_tn, P_MAX, iterations=ITERS, epsilon=EPSILON)
    X_all_pca = pca.transform(X_all)

    # The initializer makes the shared inputs available in every worker
    # regardless of the multiprocessing start method (see _init_worker).
    with mp.Pool(
        processes=MAX_PROCESSES,
        initializer=_init_worker,
        initargs=(X_all_pca, df_train, df_test, df_all),
    ) as pool:
        accuracies = dict(pool.imap(accuracies_for_P, reversed(P_LIST)))
    with open("results_all_knn_pca_{}_to_{}.pkl".format(P_MIN, P_MAX), 'wb') as file:
        pickle.dump(accuracies, file)

    # Flatten to (P, K, score) and keep the ten best: highest score first,
    # ties broken by smaller K, then by larger P.
    best_PCA = [(p, k, float(a[k - 1])) for k in range(1, MAXIMUM_K + 1) for p, a in accuracies.items()]
    best_PCA = sorted(best_PCA, key=lambda a: (-a[2], a[1], -a[0]))[0:10]

    # --- Report ----------------------------------------------------------
    print("\nBest experimental parameters using KNN (K = {}) is {}".format(BEST_EXPERIMENTAL_KNN, accuracies_KNN[BEST_EXPERIMENTAL_KNN-1]))
    print("Best experimental parameters using KNN with PCA")
    for p, k in BEST_EXPERIMENTAL_PCA_KNN:
        print("\tPara (P = {}) y (K = {}) es {:.8g}".format(p, k, accuracies[p][k-1]))
    print("\nBest actual parameter using KNN")
    for k, a in best_KNN:
        print("\tPara (K = {}) es {:.8g}".format(k, a))
    print("\nBest actual parameters using KNN with PCA")
    for p, k, a in best_PCA:
        print("\tPara (P = {}) y (K = {}) es {:.8g}".format(p, k, a))