import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import make_circles
from sklearn.datasets import make_moons
from sklearn.datasets import make_classification
from sklearn.datasets import load_digits
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_fscore_support, f1_score
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from IPython.display import Image


## Helper to compute specificity (true-negative rate)
def specificity_score(y_true, y_pred):
    """Return the specificity of a binary prediction.

    Specificity is the recall of the negative class (label 0).
    ``labels=[0, 1]`` pins the output order so we always read class 0's
    recall, even in the degenerate case where one class is absent from
    the inputs (without it, r[0] could silently be class 1's recall).
    """
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, labels=[0, 1])
    return r[0]


## Helper for automated scoring of a model
def score_model(X, y, kf):
    """Cross-validate a LogisticRegression on (X, y) and print mean metrics.

    For every train/test split produced by the KFold object *kf*, a fresh
    LogisticRegression is fitted; accuracy, precision, recall and f1 are
    collected per fold and their means printed.
    """
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = LogisticRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred))
    print("accuracy:", np.mean(accuracy_scores))
    print("precision:", np.mean(precision_scores))
    print("recall:", np.mean(recall_scores))
    print("f1 score:", np.mean(f1_scores))


###
# Basics
###

# Exercise 1: numpy averages / mean
#data = [15, 16, 18, 19, 22, 24, 29, 30, 34]
#print("mean:", np.mean(data))
#print("median:", np.median(data)) #print("50th percentile (median):", np.percentile(data, 50)) #print("25th percentile:", np.percentile(data, 25)) #print("75th percentile:", np.percentile(data, 75)) #print("standard deviation:", np.std(data)) #print("variance:", np.var(data)) #Aufgabe 2 pandas auslesen Übung #pd.options.display.max_columns = 6 #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #print(df.describe()) #Aufgabe 3 pandas daten manipulation Übung (neue Spalte hinzufügen und benennen 'male') #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #print(df.head()) #Aufgabe 4 Dataframe Shape übung (Wie ist Dataframe geformt [Anzahl Zeilen,Anzahl Spalten]) #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #arr = df[['Pclass', 'Fare', 'Age']].values #print(arr.shape) #Aufgabe 5 Summieren von Dataframe Inhalten #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #arr = df[['Pclass', 'Fare', 'Age']].values #mask = arr[:, 2] < 18 #print(mask.sum()) #print((arr[:, 2] < 18).sum()) #Aufgabe 6 Plotting Übung #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['Gender'] = df['Sex'] == 'male' #plt.scatter(df['Age'], df['Fare'], c=df['Pclass']) #plt.xlabel('Age') #plt.ylabel('Fare') #cbar = plt.colorbar() #plt.plot([0, 80], [85, 5]) #plt.show() #2.Beispiel #plt.style.use('fivethirtyeight') #fig, ax=plt.subplots() #x=df['Age'] #y1=df['Fare'] #color=df['Gender'] #size=df['Pclass'] #ax.scatter(x,y1,c=color,s=30*size,alpha=0.3) #for size in [1,2,3]: # plt.scatter([],[],c='r',s=30*size,label=str(size)+'class') # plt.legend(scatterpoints=1,frameon=False,labelspacing=1,title='Titanic') #ax.set_xlabel('Age') #ax.set_ylabel('Fare') #ax.set_xlim(0,90) #ax.set_ylim(0,555) #cbar = plt.colorbar() #plt.show() ################################################# ### # MachineLearning Algorithms mit Sklearn ### #Aufgabe 1 Pandas Daten für Model aufbereiten 
#Ergebnis: x=2D numpy Array(Matrix) aller Features, y=1D NumpyArray des Targets #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values #y = df['Survived'].values #print(X) #print(y) #Aufgabe 2 mit SKLearn Daten "fitten" #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #X = df[['Fare', 'Age']].values #y = df['Survived'].values #model = LogisticRegression() #model.fit(X, y) #print(model.coef_, model.intercept_) # Output sollte sein:[[ 0.01615949 -0.01549065]] [-0.51037152] #Aufgabe 3 mit SKLearn und Pandas Targetwerte vorhersagen #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values #y = df['Survived'].values #model = LogisticRegression() #model.fit(X, y) #print(model.predict([[3, True, 22.0, 1, 0, 7.25]])) #print(model.predict(X[:10])) #print(y[:10]) #y_pred = model.predict(X) #print((y == y_pred).sum()/y.shape[0]) #synonym da oft gebraucht: #print(model.score(X, y)) # Output Genauigkeit der vorhersagen: 0.8049605411499436 #print((y == y_pred).sum()) #print((y == y_pred).sum() / y.shape[0]) #print(model.score(X, y)) #Aufgabe 4 Model mit vordefiniertem Brust_krebs Datenset #cancer_data = load_breast_cancer() #print(cancer_data.keys()) # DESCR (Description ist teil der Daten) #print(cancer_data['DESCR']) #print(cancer_data['target']) #print(cancer_data['target'].shape) #print(cancer_data['target_names']) #df = pd.DataFrame(cancer_data['data'], columns=cancer_data['feature_names']) #df['target'] = cancer_data['target'] #print(df.head()) ##feature matrix/target array #X = df[cancer_data.feature_names].values #y = df['target'].values ##model aufbereiten #model = LogisticRegression(solver='liblinear') #model.fit(X, y) #model.predict([X[0]]) #print("prediction for datapoint 0:", 
model.predict([X[0]])) #print(model.score(X, y)) #Aufgabe 5 Bob der Baumeister ##Ziel: Input aus ,,, ## Output: 1 oder 0 ##Einlesroutine #n = int(input()) #X = [] #for i in range(n): # X.append([float(x) for x in input().split()]) #y = [int(x) for x in input().split()] #testing_datapoint = [float(x) for x in input().split()] ##Modelbau,fitting und ausgabe #model = LogisticRegression() #model.fit(X, y) #result = model.predict([testing_datapoint]) #print(result[0]) #Aufgabe 6 Metriken mit SKLEARN berechnen #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values #y = df['Survived'].values #model = LogisticRegression() #model.fit(X, y) #y_pred = model.predict(X) ##Confusion/Verwirrungs Matrix (Zeigt TN,FP,FN,TP an) #print(confusion_matrix(y, y_pred)) ##Accurcy/Genauigkeit -> Wie oft war die vorhersage richtig (TP+TN)/(TP+FP+FN+TN) #print("accuracy:", accuracy_score(y, y_pred)) ##Precicion/Präzision -> Verhältnismäßige Anzahl von Falschen Positiven (TP)/(TP+NP) ## *Note wenn Precition gegen 1 geht ist die Zahl der FalsePositives niedrig (Interessanter wenn FP gefährlicher/unerwünschter ist) #print("precision:", precision_score(y, y_pred)) ## Recall/Sensitivity/Sensibilität -> Verhältnismäßige Anzahl von FalseNegatives (TP)/(TP+FN) ## *Note wenn recall gegen 1 geht ist die Zahl der FalseNegatives niedrig (Interessant wenn FN gefährlicher/unerwünschter ist) #print("recall:", recall_score(y, y_pred)) ## F1 Score -> Durchschnitt aus precision und recall ## *Note wenn F1 score gegen 1 geht ist die anzahl an FPs und FNs niedrig -> Gute vorhersage im allgemeinen #print("f1 score:", f1_score(y, y_pred)) #Aufgabe 7 Training Data & Test Data (Verhinderung von Overfitting) #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 
'Fare']].values #y = df['Survived'].values #train_size-> Prozentanteil des Trainingsets; random_state= randomizer-seed #X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.75,random_state=80613) #print("whole dataset:", X.shape, y.shape) #print("training set:", X_train.shape, y_train.shape) #print("test set:", X_test.shape, y_test.shape) # building the model #model = LogisticRegression() #model.fit(X_train, y_train) # evaluating the model #y_pred = model.predict(X_test) #print("accuracy:", accuracy_score(y_test, y_pred)) #print("precision:", precision_score(y_test, y_pred)) #print("recall:", recall_score(y_test, y_pred)) #print("f1 score:", f1_score(y_test, y_pred)) #sensitivity_score = recall_score #print("sensitivity:", sensitivity_score(y_test, y_pred)) #print("specificity:", specificity_score(y_test, y_pred)) #Aufgabe 8 Receiver operating characteristic (ROC) Modulieren #*Note ROC-Kurve ist ein Graph der alle möglichen Modelle # und deren Performance anzeigt #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values #y = df['Survived'].values #X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.75,random_state=80613) #model = LogisticRegression() #model.fit(X_train, y_train) #y_pred_proba = model.predict_proba(X_test) #fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1]) #plt.plot(fpr, tpr) #plt.plot([0, 1], [0, 1], linestyle='--') #plt.xlim([0.0, 1.0]) #plt.ylim([0.0, 1.0]) #plt.xlabel('1 - specificity') #plt.ylabel('sensitivity') #plt.show() ##Strategie bei der Auswahl: ##Generell gilt je weiter Links oben (hohe Sensitivity+ niedrieger 1-specifity) ##desto allgemein besser ist das Model ##Je weiter Links ##desto mehr von uns positiv vorhergesagte Resultate sind korrekt ##Je weiter oben ##desto mehr wirklich positive Resultate werden gefunden (FalsePositive Minimierung) ##Vergleich von 
2 ModelVarianten gegen einander #model1 = LogisticRegression() #model1.fit(X_train, y_train) #y_pred_proba1 = model1.predict_proba(X_test) #print("model 1 AUC score:", roc_auc_score(y_test, y_pred_proba1[:, 1])) #model2 = LogisticRegression() #model2.fit(X_train[:, 0:2], y_train) #y_pred_proba2 = model2.predict_proba(X_test[:, 0:2]) #print("model 2 AUC score:", roc_auc_score(y_test, y_pred_proba2[:, 1])) ## Vergleich von unterschiedlichen Train/Test Splits gegeneinander #y_pred = model.predict(X_test) #print(" accuracy: {0:.5f}".format(accuracy_score(y_test, y_pred))) #print("precision: {0:.5f}".format(precision_score(y_test, y_pred))) #print(" recall: {0:.5f}".format(recall_score(y_test, y_pred))) #print(" f1 score: {0:.5f}".format(f1_score(y_test, y_pred))) #Aufgabe 9 KFold Cross Validierte Modelle erstellen #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values #y = df['Survived'].values #scores = [] #kf = KFold(n_splits=5, shuffle=True) #for train_index, test_index in kf.split(X): # X_train, X_test = X[train_index], X[test_index] # y_train, y_test = y[train_index], y[test_index] # model = LogisticRegression() # model.fit(X_train, y_train) # scores.append(model.score(X_test, y_test)) #Finaler Wert der precition aus allen folds(Durchschnitt) #print(scores) #print(np.mean(scores)) #Aufgabe 10 unterschiedliche Modelle Vergleichen #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #Gleiches KFold CrossValidierungsobjekt #da sonst unfaire Verhältnisse bei den Tests (Gleiche Chunkanzahl) #kf = KFold(n_splits=5, shuffle=True) #Verschiedene Modelle mit verschiedenen Featur-Ausprägungen #X1 = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values #X2 = df[['Pclass', 'male', 'Age']].values #X3 = df[['Fare', 'Age']].values #Ziel numpy array für alle gleich 
#y = df['Survived'].values #print("Logistic Regression with all features (Model1)") #score_model(X1, y, kf) #print() #print("Logistic Regression with Pclass, Sex & Age features(Model2)") #score_model(X2, y, kf) #print() #print("Logistic Regression with Fare & Age features(Model3)") #score_model(X3, y, kf) #Model1 und Model2 haben fast identische Werte #-> Model 2 wäre bessere wahl da weniger features (damit schneller computation) #model = LogisticRegression() #model.fit(X1, y) #print(model.predict([[3, False, 25, 0, 1, 2]])) #Aufgabe 11 Berechnen von Accuratcy/precision/recall/f1 score #tp, fp, fn, tn = [int(x) for x in input().split()] #total = tp+fp+fn+tn #print(tp) #print(fp) #print(fn) #print(tn) #print(total) #accuracy #accuracy = (tp+tn)/total #print(round(accuracy,4)) #precision #precision = (tp)/(tp+fp) #print(round(precision,4)) #recal #recall =(tp)/(tp+fn) #print(round(recall,4)) #f1 score #f1_score = (2*(precision)*recall)/(precision+recall) #print(round(f1_score,4)) ## ^^^^^^ bis hier hin war logistic regression a.k.a "parametrisches" machine learning #### ## vvvvvv ab hier entscheidungsbäume (nicht parametisch) #Aufgabe 1 Purity (Reinheit) ermitteln #Gini Impurity = 2 x x (1-) ##-> gini = 2*p*(1-p) #Entropy = -[*log2+(1-)log2(1-)] ##-> entropy = -[plog2p+(1-p)log2(1-p)] ## Purity wird in weiteren Formeln mit H abgekürzt/dargestellt ##Hinweis: Auswahl ob gini oder entropy ist nicht direkt ersichtlich ## aber beide können berechnet und abgeglichen werden um beste Modell zu wählen #Aufgabe 2 Information Gain aus Purity ermitteln # Formel: Information Gain = H(QuellNode)-((|A|/|QuellNode|)*H(A))-((|B|/|QuellNode|)*H(B)) #Aufgabe 3 Decision Tree Model erstellen #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values #y = df['Survived'].values #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=80613) 
#model = DecisionTreeClassifier() #model.fit(X_train, y_train) #print(model.predict([[3, True, 22, 1, 0, 7.25]])) #Aufgabe 4 Metriken für Decision Tree ermitteln #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values #y = df['Survived'].values #kf = KFold(n_splits=5, shuffle=True, random_state=80613) #dt_accuracy_scores = [] #dt_precision_scores = [] #dt_recall_scores = [] #lr_accuracy_scores = [] #lr_precision_scores = [] #lr_recall_scores = [] #for train_index, test_index in kf.split(X): # X_train, X_test = X[train_index], X[test_index] # y_train, y_test = y[train_index], y[test_index] # dt = DecisionTreeClassifier(criterion='entropy') # dt.fit(X_train, y_train) # dt_accuracy_scores.append(dt.score(X_test, y_test)) # dt_y_pred = dt.predict(X_test) # dt_precision_scores.append(precision_score(y_test, dt_y_pred)) # dt_recall_scores.append(recall_score(y_test, dt_y_pred)) # lr = LogisticRegression() # lr.fit(X_train, y_train) # lr_accuracy_scores.append(lr.score(X_test, y_test)) # lr_y_pred = lr.predict(X_test) # lr_precision_scores.append(precision_score(y_test, lr_y_pred)) # lr_recall_scores.append(recall_score(y_test, lr_y_pred)) #print("Decision Tree") #print(" accuracy:", np.mean(dt_accuracy_scores)) #print(" precision:", np.mean(dt_precision_scores)) #print(" recall:", np.mean(dt_recall_scores)) #print("Logistic Regression") #print(" accuracy:", np.mean(lr_accuracy_scores)) #print(" precision:", np.mean(lr_precision_scores)) #print(" recall:", np.mean(lr_recall_scores)) # Vergleich gini vs entropy #for criterion in ['gini', 'entropy']: # print("Decision Tree - {}".format(criterion)) # accuracy = [] # precision = [] # recall = [] # for train_index, test_index in kf.split(X): # X_train, X_test = X[train_index], X[test_index] # y_train, y_test = y[train_index], y[test_index] # dt = DecisionTreeClassifier(criterion=criterion) # 
dt.fit(X_train, y_train) # y_pred = dt.predict(X_test) # accuracy.append(accuracy_score(y_test, y_pred)) # precision.append(precision_score(y_test, y_pred)) # recall.append(recall_score(y_test, y_pred)) # print("accuracy:", np.mean(accuracy)) # print("precision:", np.mean(precision)) # print("recall:", np.mean(recall), '\n') #Aufgabe 5 Entscheidungsbaum Plotten #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #feature_names = ['Pclass', 'male'] #X = df[feature_names].values #y = df['Survived'].values #dt = DecisionTreeClassifier() #dt.fit(X, y) #fig = plt.figure(figsize=(10,5)) #tree.plot_tree(dt, feature_names=feature_names) #plt.show() #Aufgabe 6 Decision-Tree Pruning/Beschneidung #Methode 1 Limitierung der Verzweigungstiefe #Methode 2 Leave/Blatt-Knoten mit geringen Sample-Zahlen vermeiden #Methode 3 Limitierung der Leave/Blatt-Knoten Anzahl #BSP: #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #feature_names = ['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare'] #X = df[feature_names].values #y = df['Survived'].values #dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2, max_leaf_nodes=10) #dt.fit(X, y) #fig = plt.figure(figsize=(10,5)) #tree.plot_tree(dt, feature_names=feature_names) #plt.show() #Aufgabe 7 Finden der Besten Pruning Parameter via GridSearch #df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv') #df['male'] = df['Sex'] == 'male' #feature_names = ['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare'] #X = df[feature_names].values #y = df['Survived'].values #param_grid = { # 'max_depth': [5, 15, 25], # 'min_samples_leaf': [1, 3], # 'max_leaf_nodes': [10, 20, 35, 50]} #dt = DecisionTreeClassifier() #gs = GridSearchCV(dt, param_grid, scoring='f1', cv=5) #dt.fit(X, y) #gs.fit(X, y) #print("best params:", gs.best_params_) #print("best score:", gs.best_score_) #dt_best = 
DecisionTreeClassifier( # max_depth=gs.best_params_["max_depth"], # min_samples_leaf=gs.best_params_["min_samples_leaf"], # max_leaf_nodes=gs.best_params_["max_leaf_nodes"]) #dt_best.fit(X,y) #fig = plt.figure(figsize=(20,10)) #tree.plot_tree(dt_best, feature_names=feature_names) #plt.show() #Pros/Cons #decision tree ist #-> (+) zeittechnisch teuer zu bauen aber predicttechnisch günstig vorher zu sagen #-> (-vorsicht: ungünstige config verteilt Aussage kraft schlecht auf viele samples a.k.a overfitting weil einzelnes sample zu mächtig #-> (+)einfach zu verstehen und zu erläutern #Aufgabe 8 Information Gain ausrechnen #S = [int(x) for x in input().split()] #A = [int(x) for x in input().split()] #B = [int(x) for x in input().split()] #-> gini = 2*p*(1-p) #Information Gain = H(QuellNode)-((|A|/|QuellNode|)*H(A))-((|B|/|QuellNode|)*H(B)) #p_source = S.count(1)/len(S) #q_source = 1 - p_source #gini_source = (2 * p_source * q_source) #print(gini_source) #p_left = A.count(1)/len(A) #q_left = 1 - p_left #gini_left = (2 * p_left * q_left) #print(gini_left) #p_right = B.count(1)/len(B) #q_right = 1 - p_right #gini_right = (2 * p_right * q_right) #print(gini_right) #gain = gini_source - ((len(A)/len(S))*gini_left) - ((len(B)/len(S))*gini_right) #print(round(gain,5)) ## ^^^^^^ bis hier hin war entscheidungsbäume a.k.a deciion trees #### ## vvvvvv ab hier Random Forests #Aufgabe 1 Erstellen des Random Forest #cancer_data = load_breast_cancer() #df = pd.DataFrame(cancer_data['data'], columns=cancer_data['feature_names']) #df['target'] = cancer_data['target'] #X = df[cancer_data.feature_names].values #y = df['target'].values #print('data dimensions', X.shape) #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=80613) #rf = RandomForestClassifier() #rf.fit(X_train, y_train) #first_row = X_test[0] #print("prediction:", rf.predict([first_row])) #print("true value:", y_test[0]) #print("random forest accuracy:", rf.score(X_test, y_test)) #dt = 
DecisionTreeClassifier() #dt.fit(X_train, y_train) #print("decision tree accuracy:", dt.score(X_test, y_test)) #Aufgabe 2 Random Forest Tuning #cancer_data = load_breast_cancer() #df = pd.DataFrame(cancer_data['data'], columns=cancer_data['feature_names']) #df['target'] = cancer_data['target'] #X = df[cancer_data.feature_names].values #y = df['target'].values #print('data dimensions', X.shape) #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=80613) #param_grid = { # 'n_estimators': [10, 25, 50, 75, 100], #} #Hinweis default bei RF ist normalerweise "auto" a.k.a SQRT(AnzahlFeatures) -> normalerweise gute Wahl #rf = RandomForestClassifier(max_features=5,n_estimators=15) #rf.fit(X_train, y_train) #rf = RandomForestClassifier(random_state=80613) ##Hinweis2 scoring = 'f1' wird meist gewählt wenn Datenset nicht sehr balanciert ist da bei unbalancierten Daten gini/accuracy schlechte ergebnisse liefert #gs = GridSearchCV(rf, param_grid, scoring='f1',cv=5) #gs.fit(X,y) #print ("best params:",gs.best_params_) #first_row = X_test[0] #print("prediction:", gs.predict([first_row])) #print("true value:", y_test[0]) #print("random forest accuracy:", gs.score(X_test, y_test)) #Aufgabe 3 Plotten von Random Forest mit "Elbow-Graph" #cancer_data = load_breast_cancer() #df = pd.DataFrame(cancer_data['data'], columns=cancer_data['feature_names']) #df['target'] = cancer_data['target'] #X = df[cancer_data.feature_names].values #y = df['target'].values #print('data dimensions', X.shape) #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=80613) #n_estimators = list(range(1,101)) ##nach 100er Graph sieht man bei 10 anfang von stagnation -> für verbesserte Performance nur bis hier hin generieren, da weitere Bäume zu wenig dazugewinn sind #n_estimators = list(range(1,10)) #param_grid = { # 'n_estimators': n_estimators, #} #rf = RandomForestClassifier(random_state=80613) #gs = GridSearchCV(rf, param_grid, scoring='f1',cv=5) #gs.fit(X,y) #print ("best 
params:",gs.best_params_) #scores = gs.cv_results_['mean_test_score'] #first_row = X_test[0] ##print("prediction:", gs.predict([first_row])) ##print("true value:", y_test[0]) ##print("random forest accuracy:", gs.score(X_test, y_test)) #plt.plot(n_estimators, scores) #plt.xlabel("n_estimators") #plt.ylabel("accuracy") ##plt.xlim(0, 100) #plt.xlim(0, 10) #plt.ylim(0.9, 1) #plt.show() #Aufgabe 4 Feature Selection (Limitierung der genutzten Feature für Performance) #cancer_data = load_breast_cancer() #df = pd.DataFrame(cancer_data['data'], columns=cancer_data['feature_names']) #df['target'] = cancer_data['target'] #X = df[cancer_data.feature_names].values #y = df['target'].values #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=80613) #rf = RandomForestClassifier(n_estimators=10, random_state=80613) #rf.fit(X_train, y_train) #ft_imp = pd.Series(rf.feature_importances_, index=cancer_data.feature_names).sort_values(ascending=False) #print(ft_imp.head(10)) ##warum ist feature Selection wichtig: ##- schnelleres Model Training ##- reduziert komplexität ##- bei richtiger Feature Wahl -> Verbesserung der Genauigkeit da unnötige Features (Noise) entfernt wird #print(rf.score(X_test, y_test)) #worst_cols = [col for col in df.columns if 'worst' in col] #print(worst_cols) #X_worst = df[worst_cols] #X_train, X_test, y_train, y_test = train_test_split(X_worst, y, random_state=80613) #rf.fit(X_train, y_train) #print(rf.score(X_test, y_test)) #Aufgabe 5 Random Forest Pros/Cons Beispiele ##feature matrix/target array #X, y = make_circles(noise=0.2, factor=0.5, random_state=1) #df bauen für plotting anzeige #df = pd.DataFrame(X,columns=["x", "y"]) #df['target'] = y #print(df.shape) #print("x:",X) #print("y:",y) #kf = KFold(n_splits=5, shuffle=True, random_state=1) #lr_scores = [] #rf_scores = [] #for train_index, test_index in kf.split(X): # X_train, X_test = X[train_index], X[test_index] # y_train, y_test = y[train_index], y[test_index] # lr = 
LogisticRegression(solver='lbfgs') # lr.fit(X_train, y_train) # lr_scores.append(lr.score(X_test, y_test)) # rf = RandomForestClassifier(n_estimators=100) # rf.fit(X_train, y_train) # rf_scores.append(rf.score(X_test, y_test)) #print("LR accuracy:", np.mean(lr_scores)) #print("RF accuracy:", np.mean(rf_scores)) #plt.scatter(df["x"],df["y"],c=y) #plt.xlabel('x') #plt.ylabel('y') #cbar = plt.colorbar() #plt.show() ##Best Practice (Benchmarking): # Bei neuem Classification Problem ist es üblich ein Linear Regression sowie Random Forest Model zu erstellen. # Diese benötigen zu beginn wenig bis kein Tuning um relativ gute Ergebnisse zu liefern. # Es ist so auch direkt ersichtlich welcher Modeltyp eine generel bessere Wahl ist. # Diese Methode gibt erste Anzeichen für mögliche/offensichtliche Optimierungen ##Aufgabe 6 Übung Aus Input Daten Sätze Random Forest bauen ##Param1 Random state für traintestsplit&RF #random_s = int(input()) ##Param2 Anzahl Datepunkte #n_datapoints = int(input()) #rows = [] ##Param3 Daten für X Array #for i in range(n_datapoints): # rows.append([float(a) for a in input().split()]) #X = np.array(rows) ##Param4 Daten für Target Werte #y = np.array([int(a) for a in input().split()]) #print("randoms: ",random_s) #print("datapoints: ",n_datapoints) #print("rows: ",rows) #print("X: ",X) #print("y: ",y) #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_s) #rf = RandomForestClassifier(n_estimators=5,random_state=random_s) #rf.fit(X_train, y_train) #Output Vorhersage mit den Test Set #print(rf.predict(X_test)) #print("true value:", y_test[0]) ## ^^^^^^ bis hier hin war Random Forest #### ## vvvvvv ab hier Neural Networks #Fun Fact: Künstliches Neural Network(ANN) ist biologischem Neural Network im Gehirn Nachempfunden #-> Menschliches Gehirn hat 86 Milliarden Neuronen und ca. 
100 Trillionen Synapsen sprich wenn Neurales Netzwerk mehr # als das hat dann ist es vernetzter als ein menschliches Gehirn #Basics: Neuronen #3 Activation Funktionen (Funktionen, die Input eines Neurons zu Output wandeln) #A) Sigmoid: 1/(1+e^(-x)) mit x = w1x1 + w2x2 + b- # -> Liefert Output zwischen 0 und 1 #B) hyperbolic Tangens: tanh(x) = sinh(x)/cosh(x) = (e^(x) - e^(-x))/(e^(x) + e^(-x)) # -> Liefer Output zwischen -1 und 1 #C) Rectified Linear Unit: ReLU(x) = {0 wenn x <= 0, x wenn x>0} # -> Liefert Output ab 0 bis x (negativ werte werden geschluckt) #Neuronen werden so geformt, dass deren Output oft weiteren Neuronen als Input dient # -> Multi-Layered Perceptron(MLP) # -> Feed Forward (nur in eine Richtung weiter verteilt) #Artificial Neural Network (ANN) trainieren # Grundsätzlich immer: Optimieren einer Loss-Funktion # -> Genutzt wird meist cross entropy [p wenn y = 1, 1-p wenn y = 0] ##Aufgabe 1 Generierung von Random Datensets (für test) + Plotten #X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=80613) #print(X) #print(y) #plt.scatter(X[y==0][:, 0], X[y==0][:, 1], s=100, edgecolors='k') #plt.scatter(X[y==1][:, 0], X[y==1][:, 1], s=100, edgecolors='k', marker='^') #plt.show() ##Aufgabe 2 neural network bauen #X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=80613) #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=80613) #mlp = MLPClassifier(max_iter=1000,hidden_layer_sizes=(100, 50),alpha=0.0001, solver='adam', random_state=80613) #mlp.fit(X_train, y_train) #print("accuracy:", mlp.score(X_test, y_test)) ##Aufgabe 3 Reale Datensets nutzen (MNIST für Handgeschriebene Zahlen-Zeichen) ##Hinweis MNIST Dataset hat Numernzeichen in Grayscale in en Werten 0(schwarz)-16(hellstes Weiß) gespeichert #X, y = load_digits(return_X_y=True) #print(X.shape, y.shape) #print(X[0].reshape(8,8)) #print(y[0]) ##matshow zeichnet die Daten von x in einer 8x8 Matrix in der colormap 
# gray; only useful when the data are images and the resolution is known
#plt.matshow(X[1].reshape(8,8),cmap=plt.cm.gray)
## xticks/yticks with empty tuples remove the centred coordinate rulers at the edges
#plt.xticks(())
#plt.yticks(())
#plt.show()
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=80613)
#mlp = MLPClassifier(random_state=80613)
#mlp.fit(X_train, y_train)
#x = X_test[1]
#print(mlp.predict([x]))
#print(mlp.score(X_test, y_test))
## Check which predictions were incorrect:
## 1) predict the whole test split
## 2) filter the prediction array for wrong answers and keep them
## 3) store predicted value & true value for display
#y_pred = mlp.predict(X_test)
#incorrect = X_test[y_pred != y_test]
#incorrect_true = y_test[y_pred != y_test]
#incorrect_pred = y_pred[y_pred != y_test]
## show the first one that was not correct
#j = 0
#plt.matshow(incorrect[j].reshape(8,8),cmap=plt.cm.gray)
#plt.xticks(())
#plt.yticks(())
#plt.show()
#print("True value: ",incorrect_true[j])
#print("predicted value: ",incorrect_pred[j])

## Exercise 4: visualising the MLP weights on the full MNIST dataset.
# as_frame=False guarantees numpy arrays — newer scikit-learn versions return
# a pandas DataFrame by default, which breaks the array-style diagnostics
# commented out above (e.g. X[1].reshape(...)).
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
#print(X.shape, y.shape)
## show the minimum and maximum pixel values
#print(np.min(X), np.max(X))
#print(y[0:5])

## Restrict the label range: keep only the digits '0'..'3'.
# Labels come back as strings, so the comparison is lexicographic, which is
# equivalent to numeric comparison for single-digit labels.
X_low = X[y <= '3']
y_low = y[y <= '3']

mlp = MLPClassifier(
    hidden_layer_sizes=(6,),  # 6 hidden nodes -> 6 weight images plotted below
    max_iter=200,
    alpha=1e-4,
    solver='sgd',
    random_state=80613)
mlp.fit(X_low, y_low)

## Show the coefficients:
## one weight matrix per layer; mlp.coefs_[0] holds one 784-dimensional
## input-weight vector per hidden node (here: 6 nodes).
print(mlp.coefs_)
print(mlp.coefs_[0].shape)

# Plot each hidden node's input weights as a 28x28 grayscale image.
fig, axes = plt.subplots(2, 3, figsize=(5, 4))
for i, ax in enumerate(axes.ravel()):
    coef = mlp.coefs_[0][:, i]
    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray)
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(i + 1)
plt.show()