데이터 셋 생성
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
# Build the toy data set: one blob of 50 samples (std 0.5) from make_blobs,
# seeded so the figure is reproducible.
X, _ = make_blobs(n_samples=50, centers=1, cluster_std=0.5, random_state=42)
# Four hand-placed outliers at (+/-3, +/-3).
outliers = np.array([[3, 3], [3, -3], [-3, 3], [-3, -3]])
X_with_outliers = np.vstack([X, outliers])
# Draw on a dedicated figure instead of pyplot's implicit global one, and
# close it after saving, so later plots in this file start from a clean
# canvas instead of inheriting these artists.
fig, ax = plt.subplots()
ax.scatter(X_with_outliers[:, 0], X_with_outliers[:, 1], c='blue', s=50)
# Re-draw the outliers on top with a red cross so they stand out.
ax.scatter(outliers[:, 0], outliers[:, 1], c='red', s=100, marker='x')
ax.set_title('Data with Outliers')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.grid(True)
fig.savefig('./graph.png')
plt.close(fig)
K-means
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# Rebuild the same seeded data set so this section can run on its own.
X, _ = make_blobs(n_samples=50, centers=1, cluster_std=0.5, random_state=42)
# Four hand-placed outliers at (+/-3, +/-3).
outliers = np.array([[3, 3], [3, -3], [-3, 3], [-3, -3]])
X_with_outliers = np.vstack([X, outliers])
# Fit a single-cluster K-means on blob + outliers. n_init is given explicitly
# because its default changed across scikit-learn releases (and warns on some
# of them); 10 is the historical default.
kmeans = KMeans(n_clusters=1, n_init=10, random_state=42)
kmeans.fit(X_with_outliers)
kmeans_centroid = kmeans.cluster_centers_
# Draw on a dedicated figure and close it after saving so this plot neither
# inherits artists from earlier plots nor leaks its own into later ones.
fig, ax = plt.subplots()
ax.scatter(X_with_outliers[:, 0], X_with_outliers[:, 1], c='blue', s=50)
ax.scatter(outliers[:, 0], outliers[:, 1], c='red', s=100, marker='x')
# The fitted centroid is marked with a large green star.
ax.scatter(kmeans_centroid[:, 0], kmeans_centroid[:, 1], c='green', s=200, marker='*')
ax.set_title('K-means Centroid with Outliers')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.grid(True)
fig.savefig('./graph1.png')
plt.close(fig)
# Bare expression: echoes the centroid in an interactive session or notebook
# cell; it has no effect when the file is run as a script.
kmeans_centroid
PAM
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# Same seeded blob as the earlier sections, regenerated so the PAM example
# can run on its own.
X, _ = make_blobs(
    n_samples=50,
    centers=1,
    cluster_std=0.5,
    random_state=42,
)
# Four hand-placed outliers at (+/-3, +/-3).
outliers = np.array([(3, 3), (3, -3), (-3, 3), (-3, -3)])
# Stack the outliers underneath the blob samples (row-wise).
X_with_outliers = np.concatenate((X, outliers), axis=0)
#functions
def manhattan_distance(point1, point2):
    """Return the Manhattan (L1) distance between two points.

    Args:
        point1: Array-like of coordinates.
        point2: Array-like of coordinates, broadcast-compatible with
            ``point1``.

    Returns:
        The sum of the absolute coordinate-wise differences, as a NumPy
        scalar.
    """
    # asarray avoids copying inputs that are already ndarrays.
    return np.sum(np.abs(np.asarray(point1) - np.asarray(point2)))
def assign_to_clusters(data, medoids):
    """Assign every data point to its nearest medoid by Manhattan distance.

    Args:
        data: Array-like of shape (n_points, n_features).
        medoids: Array-like of shape (n_medoids, n_features).

    Returns:
        1-D integer array of length ``n_points``; entry ``i`` is the index
        (into ``medoids``) of the medoid closest to ``data[i]``. Ties go to
        the lowest medoid index (``np.argmin`` semantics).

    Raises:
        ValueError: If ``data`` is non-empty but no medoids are given.
    """
    data = np.asarray(data)
    medoids = np.asarray(medoids)
    if data.shape[0] == 0:
        # Nothing to assign; return an empty index array instead of failing.
        return np.empty(0, dtype=np.intp)
    if medoids.size == 0:
        raise ValueError("at least one medoid is required")
    # Broadcast (n_points, 1, n_features) against (1, n_medoids, n_features)
    # to get every point-to-medoid L1 distance in one vectorized step.
    diffs = data[:, np.newaxis, :] - medoids[np.newaxis, :, :]
    distances = np.abs(diffs).sum(axis=2)
    return np.argmin(distances, axis=1)
def compute_cost(data, medoids, clusters):
    """Return the total Manhattan distance from each point to its medoid.

    Args:
        data: Array-like of shape (n_points, n_features).
        medoids: Array-like of shape (n_medoids, n_features).
        clusters: Sequence of length ``n_points``; ``clusters[i]`` is the
            index into ``medoids`` of the medoid assigned to ``data[i]``.

    Returns:
        Sum over all points of the L1 distance to the assigned medoid
        (``0.0`` for empty ``data``).
    """
    data = np.asarray(data)
    if data.size == 0:
        # Summing over no points gives zero cost.
        return np.float64(0.0)
    medoids = np.asarray(medoids)
    clusters = np.asarray(clusters, dtype=np.intp)
    # Fancy-index the medoid assigned to each point, then sum per-point
    # distances first and the per-point totals second.
    per_point = np.abs(data - medoids[clusters]).sum(axis=1)
    return per_point.sum()
def pam_clustering(data, initial_medoids):
    """Greedy PAM (k-medoids) search using Manhattan distance.

    Starting from ``initial_medoids``, repeatedly tries replacing each medoid
    with each data point and keeps a swap as soon as it strictly lowers the
    total cost. Stops after a full pass over all (point, medoid) pairs that
    accepts no swap.

    Args:
        data: Array-like of shape (n_points, n_features).
        initial_medoids: Array-like of shape (n_medoids, n_features).

    Returns:
        A list of ``(medoids, clusters, cost)`` tuples. The first entry is
        the initial state; one entry is appended for every accepted swap, so
        the last entry holds the final medoids, assignments and cost.
    """
    data = np.asarray(data)
    medoids = initial_medoids
    clusters = assign_to_clusters(data, medoids)
    current_cost = compute_cost(data, medoids, clusters)
    history = [(medoids, clusters, current_cost)]
    # Candidate medoid arrays must be able to hold data points exactly:
    # copying integer initial medoids and then assigning a float point would
    # otherwise truncate its coordinates silently.
    swap_dtype = np.result_type(data.dtype, np.asarray(initial_medoids).dtype)
    changed = True
    while changed:
        changed = False
        for point in data:
            for medoid_idx in range(len(medoids)):
                if np.array_equal(point, medoids[medoid_idx]):
                    # Swapping a medoid with itself cannot change the cost.
                    continue
                # Fresh copy each time so rejected swaps leave `medoids` intact.
                candidate = np.array(medoids, dtype=swap_dtype)
                candidate[medoid_idx] = point
                candidate_clusters = assign_to_clusters(data, candidate)
                candidate_cost = compute_cost(data, candidate, candidate_clusters)
                # Strict improvement only: guarantees the loop terminates.
                if candidate_cost < current_cost:
                    current_cost = candidate_cost
                    clusters = candidate_clusters
                    medoids = candidate
                    changed = True
                    history.append((medoids, clusters, current_cost))
    return history
# Run PAM with a single medoid, seeded with the first sample of the data set.
initial_medoid_pam = np.array([X_with_outliers[0]])
pam_history_pam = pam_clustering(X_with_outliers, initial_medoid_pam)
# The last history entry holds the final state; element 0 is its medoid array.
final_medoid_pam = pam_history_pam[-1][0]
# Draw on a dedicated figure and close it after saving so this plot neither
# inherits artists from earlier plots nor leaks its own into later ones.
fig, ax = plt.subplots()
ax.scatter(X_with_outliers[:, 0], X_with_outliers[:, 1], c='blue', s=50)
ax.scatter(outliers[:, 0], outliers[:, 1], c='red', s=100, marker='x')
# The final medoid is marked with a large purple triangle.
ax.scatter(final_medoid_pam[:, 0], final_medoid_pam[:, 1], c='purple', s=200, marker='^')
ax.set_title('PAM Medoid with Outliers')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.grid(True)
fig.savefig('./graph2.png')
plt.close(fig)
# Bare expression: echoes the medoid in an interactive session or notebook
# cell; it has no effect when the file is run as a script.
final_medoid_pam
'Develop > Machine_Learning' 카테고리의 다른 글
[Machine_learning]Elastic Net regression (1) | 2023.10.16 |
---|---|
[Machine_Learning]OLS, Ridge, Lasso regression code (0) | 2023.10.16 |
[Machine_Learning]Support Vector Machine(SVM) (0) | 2023.09.19 |
[Machine_Learning]Logistic Regression (0) | 2023.09.19 |
[Machine_Learning]Gini index와 Gini Gain (0) | 2023.09.06 |