# Code from the Sentdex tutorials: https://www.youtube.com/user/sentdex
# For scikit-learn code and tutorials, see: https://abeeweeda.blogspot.com/2019/04/some-python-scikit-learn.html

#regression
import math
import datetime

import numpy as np
import pandas as pd
import quandl
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn import preprocessing
# train_test_split moved from sklearn.cross_validation to sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

style.use('ggplot')

df = quandl.get("WIKI/GOOGL")
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]

# Daily high-low spread and open-close change, as percentages
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]

forecast_col = 'Adj. Close'
df.fillna(value=-99999, inplace=True)

# Forecast 1% of the dataset's length into the future
forecast_out = int(math.ceil(0.01 * len(df)))
df['label'] = df[forecast_col].shift(-forecast_out)

X = np.array(df.drop(['label'], axis=1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]  # most recent rows, with no label yet
X = X[:-forecast_out]

df.dropna(inplace=True)
y = np.array(df['label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
confidence = clf.score(X_test, y_test)

forecast_set = clf.predict(X_lately)

df['Forecast'] = np.nan

# Extend the date index one day at a time to hold the forecast values
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day

for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [i]

df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()


#KNN manual
import random
import warnings
from collections import Counter

import numpy as np
import pandas as pd

def k_nearest_neighbors(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups!')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
    # Majority vote among the k nearest points
    votes = [i[1] for i in sorted(distances)[:k]]
    vote_result = Counter(votes).most_common(1)[0][0]
    confidence = Counter(votes).most_common(1)[0][1] / k
    return vote_result, confidence

df = pd.read_csv(r'C:\Users\avishek\Downloads\DataFiles\B_C.csv')
df.replace('?', -99999, inplace=True)
df.drop(['id'], axis=1, inplace=True)
full_data = df.astype(float).values.tolist()
random.shuffle(full_data)

test_size = 0.4
train_set = {2: [], 4: []}  # class 2 = benign, 4 = malignant
test_set = {2: [], 4: []}
train_data = full_data[:-int(test_size * len(full_data))]
test_data = full_data[-int(test_size * len(full_data)):]

for i in train_data:
    train_set[i[-1]].append(i[:-1])
for i in test_data:
    test_set[i[-1]].append(i[:-1])

correct = 0
total = 0
for group in test_set:
    for data in test_set[group]:
        vote, confidence = k_nearest_neighbors(train_set, data, k=5)
        if group == vote:
            correct += 1
        total += 1

print('Accuracy:', correct / total)


#nearest neighbour (Google tutorial), manual
from scipy.spatial import distance
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def euc(a, b):
    return distance.euclidean(a, b)

iris = datasets.load_iris()
X = iris.data
y = iris.target

class MyKnn():
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        prediction = []
        for row in X_test:
            label = self.closest(row)
            prediction.append(label)
        return prediction

    def closest(self, row):
        # Return the label of the single nearest training example (1-NN)
        best_dist = euc(row, self.X_train[0])
        index = 0
        for i in range(1, len(self.X_train)):
            dis = euc(row, self.X_train[i])
            if dis < best_dist:
                best_dist = dis
                index = i
        return self.y_train[index]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
classifier = MyKnn()
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
print(accuracy_score(y_test, prediction))
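
# Not from the tutorial: a minimal sklearn cross-check of the 1-NN classifier
# above. KNeighborsClassifier with n_neighbors=1 mirrors MyKnn.closest.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.5)

clf = KNeighborsClassifier(n_neighbors=1)  # single nearest neighbour, as in MyKnn
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))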

# SVM manual
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np

style.use('ggplot')

class Support_Vector_Machine:
    def __init__(self, visualization=True):
        self.visualization = visualization
        self.colors = {1: 'r', -1: 'b'}
        if self.visualization:
            self.fig = plt.figure()
            self.ax = self.fig.add_subplot(1, 1, 1)

    # train
    def fit(self, data):
        self.data = data
        # { ||w||: [w, b] }
        opt_dict = {}

        transforms = [[1, 1],
                      [-1, 1],
                      [-1, -1],
                      [1, -1]]

        all_data = []
        for yi in self.data:
            for featureset in self.data[yi]:
                for feature in featureset:
                    all_data.append(feature)

        self.max_feature_value = max(all_data)
        self.min_feature_value = min(all_data)
        all_data = None

        # support vectors yi(xi.w+b) = 1
        step_sizes = [self.max_feature_value * 0.1,
                      self.max_feature_value * 0.01,
                      # point of expense:
                      self.max_feature_value * 0.001]

        # extremely expensive
        b_range_multiple = 2
        # we don't need to take as small of steps with b as we do with w
        b_multiple = 5
        latest_optimum = self.max_feature_value * 10

        for step in step_sizes:
            w = np.array([latest_optimum, latest_optimum])
            # we can do this because convex
            optimized = False
            while not optimized:
                for b in np.arange(-1 * (self.max_feature_value * b_range_multiple),
                                   self.max_feature_value * b_range_multiple,
                                   step * b_multiple):
                    for transformation in transforms:
                        w_t = w * transformation
                        found_option = True
                        # weakest link in the SVM fundamentally;
                        # SMO attempts to fix this a bit
                        # yi(xi.w+b) >= 1
                        for i in self.data:
                            for xi in self.data[i]:
                                yi = i
                                if not yi * (np.dot(w_t, xi) + b) >= 1:
                                    found_option = False
                        if found_option:
                            opt_dict[np.linalg.norm(w_t)] = [w_t, b]

                if w[0] < 0:
                    optimized = True
                    print('Optimized a step.')
                else:
                    w = w - step

            norms = sorted([n for n in opt_dict])
            # ||w|| : [w, b] -- pick the smallest-magnitude w found so far
            opt_choice = opt_dict[norms[0]]
            self.w = opt_choice[0]
            self.b = opt_choice[1]
            latest_optimum = opt_choice[0][0] + step * 2

        for i in self.data:
            for xi in self.data[i]:
                yi = i
                print(xi, ':', yi * (np.dot(self.w, xi) + self.b))

    def predict(self, features):
        # sign( x.w+b )
        classification = np.sign(np.dot(np.array(features), self.w) + self.b)
        if classification != 0 and self.visualization:
            self.ax.scatter(features[0], features[1], s=200,
                            marker='*', c=self.colors[classification])
        return classification

    def visualize(self):
        [[self.ax.scatter(x[0], x[1], s=100, color=self.colors[i])
          for x in self.data[i]] for i in self.data]

        # hyperplane = x.w+b ; psv = 1, nsv = -1, decision boundary = 0
        def hyperplane(x, w, b, v):
            return (-w[0] * x - b + v) / w[1]

        datarange = (self.min_feature_value * 0.9, self.max_feature_value * 1.1)
        hyp_x_min = datarange[0]
        hyp_x_max = datarange[1]

        # (w.x+b) = 1 : positive support vector hyperplane
        psv1 = hyperplane(hyp_x_min, self.w, self.b, 1)
        psv2 = hyperplane(hyp_x_max, self.w, self.b, 1)
        self.ax.plot([hyp_x_min, hyp_x_max], [psv1, psv2], 'k')

        # (w.x+b) = -1 : negative support vector hyperplane
        nsv1 = hyperplane(hyp_x_min, self.w, self.b, -1)
        nsv2 = hyperplane(hyp_x_max, self.w, self.b, -1)
        self.ax.plot([hyp_x_min, hyp_x_max], [nsv1, nsv2], 'k')

        # (w.x+b) = 0 : decision boundary
        db1 = hyperplane(hyp_x_min, self.w, self.b, 0)
        db2 = hyperplane(hyp_x_max, self.w, self.b, 0)
        self.ax.plot([hyp_x_min, hyp_x_max], [db1, db2], 'y--')

        plt.show()

data_dict = {-1: np.array([[1, 7], [2, 8], [3, 8]]),
             1: np.array([[5, 1], [6, -1], [7, 3]])}

svm = Support_Vector_Machine()
svm.fit(data=data_dict)

predict_us = [[0, 10], [1, 3], [3, 4],
              [3, 5], [5, 5], [5, 6],
              [6, -5], [5, 8]]

for p in predict_us:
    svm.predict(p)

svm.visualize()
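
# Not from the tutorial: the same toy problem through sklearn's linear SVC,
# as a sanity check on the manual optimizer. coef_ and intercept_ play the
# role of self.w and self.b.
from sklearn import svm as sk_svm  # aliased so it doesn't clash with the svm instance above

X_toy = np.array([[1, 7], [2, 8], [3, 8], [5, 1], [6, -1], [7, 3]])
y_toy = np.array([-1, -1, -1, 1, 1, 1])

sk_clf = sk_svm.SVC(kernel='linear')
sk_clf.fit(X_toy, y_toy)
print(sk_clf.predict([[0, 10], [6, -5]]))  # predicted class labels
print(sk_clf.coef_, sk_clf.intercept_)     # learned w and b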

#k-means sklearn
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
from sklearn.cluster import KMeans

style.use('ggplot')

X = np.array([[1, 2],
              [1.5, 1.8],
              [5, 8],
              [8, 8],
              [1, 0.6],
              [9, 11]])

clf = KMeans(n_clusters=2)
clf.fit(X)

centroids = clf.cluster_centers_
labels = clf.labels_
print(centroids)
print('-----------')
print(labels)

colors = ['r.', 'g.']
for i in range(len(X)):
    plt.plot(X[i, 0], X[i, 1], colors[labels[i]])
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=150)
plt.show()


#k-means manual
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np

style.use('ggplot')

X = np.array([[1, 2],
              [1.5, 1.8],
              [5, 8],
              [8, 8],
              [1, 0.6],
              [9, 11]])

colors = 10 * ["g", "r", "c", "b", "k"]

class K_Means:
    def __init__(self, k=2, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    def fit(self, data):
        # Start the centroids on the first k data points
        self.centroids = {}
        for i in range(self.k):
            self.centroids[i] = data[i]

        for _ in range(self.max_iter):
            self.classifications = {}
            for i in range(self.k):
                self.classifications[i] = []

            # Assign each point to its nearest centroid
            for featureset in data:
                distances = [np.linalg.norm(featureset - self.centroids[centroid])
                             for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(featureset)

            prev_centroids = dict(self.centroids)

            # Move each centroid to the mean of its assigned points
            for classification in self.classifications:
                self.centroids[classification] = np.average(
                    self.classifications[classification], axis=0)

            # Stop once no centroid moves by more than tol percent
            optimized = True
            for c in self.centroids:
                original_centroid = prev_centroids[c]
                current_centroid = self.centroids[c]
                if np.sum((current_centroid - original_centroid) / original_centroid * 100.0) > self.tol:
                    print(np.sum((current_centroid - original_centroid) / original_centroid * 100.0))
                    optimized = False

            if optimized:
                break

    def predict(self, data):
        distances = [np.linalg.norm(data - self.centroids[centroid])
                     for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification

clf = K_Means()
clf.fit(X)

for centroid in clf.centroids:
    plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1],
                marker="o", color="k", s=150, linewidths=5)

for classification in clf.classifications:
    color = colors[classification]
    for featureset in clf.classifications[classification]:
        plt.scatter(featureset[0], featureset[1],
                    marker="x", color=color, s=150, linewidths=5)

plt.show()
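
# Usage sketch for the predict() method above: classify a handful of unseen
# points against the fitted centroids (the points below are made up).
unknowns = np.array([[1, 3], [8, 9], [0, 3], [5, 4], [6, 4]])

for centroid in clf.centroids:
    plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1],
                marker="o", color="k", s=150, linewidths=5)

for unknown in unknowns:
    classification = clf.predict(unknown)
    plt.scatter(unknown[0], unknown[1], marker="*",
                color=colors[classification], s=150, linewidths=5)

plt.show()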

#k-means, mine: similar to Sentdex's but written by me
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np

style.use('ggplot')

X = np.array([[1, 2],
              [1.5, 1.8],
              [5, 8],
              [8, 8],
              [1, 0.6],
              [9, 11]])

colors = 10 * ["g", "r", "c", "b", "k"]

class k_means():
    def __init__(self, k=2, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    def fit(self, data):
        self.centroids = {}
        for i in range(self.k):
            self.centroids[i] = data[i]

        self.classifications = {}
        for _ in range(self.max_iter):
            for i in range(self.k):
                self.classifications[i] = []

            # Assign each point to the nearest centroid
            for feature in data:
                distance = []
                for centroid in self.centroids:
                    distance.append(np.linalg.norm(feature - self.centroids[centroid]))
                min_distance_index = distance.index(min(distance))
                self.classifications[min_distance_index].append(feature)

            prev_centroids = dict(self.centroids)

            # Recompute each centroid as the mean of its cluster
            for i in self.classifications:
                self.centroids[i] = np.average(self.classifications[i], axis=0)

            # Collect the percent movement of every centroid
            self.tolerance = []
            for i in self.centroids:
                current_centroid = self.centroids[i]
                prev_centroid = prev_centroids[i]
                self.tolerance.append(
                    np.sum((current_centroid - prev_centroid) / prev_centroid * 100.0))

            if sum(self.tolerance) <= self.k * self.tol:
                break

clf = k_means()
clf.fit(X)

for centroid in clf.centroids:
    plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1],
                marker="o", color="k", s=150, linewidths=5)

for classification in clf.classifications:
    color = colors[classification]
    for featureset in clf.classifications[classification]:
        plt.scatter(featureset[0], featureset[1],
                    marker="x", color=color, s=150, linewidths=5)

plt.show()

#mean shift manual
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np

style.use('ggplot')

X = np.array([[1, 2],
              [1.5, 1.8],
              [5, 8],
              [8, 8],
              [1, 0.6],
              [9, 11],
              [8, 2],
              [10, 2],
              [9, 3]])

colors = 10 * ["g", "r", "c", "b", "k"]

class Mean_Shift:
    def __init__(self, radius=4):
        self.radius = radius

    def fit(self, data):
        # Every data point starts as its own centroid
        centroids = {}
        for i in range(len(data)):
            centroids[i] = data[i]

        while True:
            new_centroids = []
            for i in centroids:
                in_bandwidth = []
                centroid = centroids[i]
                # Each centroid moves to the mean of the points within its radius
                for featureset in data:
                    if np.linalg.norm(featureset - centroid) < self.radius:
                        in_bandwidth.append(featureset)

                new_centroid = np.average(in_bandwidth, axis=0)
                new_centroids.append(tuple(new_centroid))

            # Merge centroids that converged to the same point
            uniques = sorted(list(set(new_centroids)))

            prev_centroids = dict(centroids)
            centroids = {}
            for i in range(len(uniques)):
                centroids[i] = np.array(uniques[i])

            # Converged once no centroid moved this pass
            optimized = True
            for i in centroids:
                if not np.array_equal(centroids[i], prev_centroids[i]):
                    optimized = False
                if not optimized:
                    break
            if optimized:
                break

        self.centroids = centroids

clf = Mean_Shift()
clf.fit(X)

centroids = clf.centroids

plt.scatter(X[:, 0], X[:, 1], s=150)
for c in centroids:
    plt.scatter(centroids[c][0], centroids[c][1], color='k', marker='*', s=150)
plt.show()
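
# Not from the tutorial: the same data through sklearn's MeanShift, with
# estimate_bandwidth standing in for the hand-picked radius (the quantile
# value here is an arbitrary choice).
from sklearn.cluster import MeanShift, estimate_bandwidth

bandwidth = estimate_bandwidth(X, quantile=0.3)
ms = MeanShift(bandwidth=bandwidth)
ms.fit(X)
print(ms.cluster_centers_)
print(ms.labels_)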