# python k-means: a messy collection of assorted programs (python k-means 一堆乱七八糟的程序)
# Posted: 2021-05-28 08:02
# Tags: nsf sklearn div pairs load size mod uniq put
# Project path: F:\PythonProject\K-Means
# Original article: https://www.cnblogs.com/herd/p/14785086.html
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Cumulative explained-variance ratio used by main(): the index of the first
# principal component whose running total exceeds this value is reported.
threshold_value = 0.85
def _sorted_eigen(features_std):
    """Return the covariance eigen-decomposition of *features_std*, largest first.

    Args:
        features_std: 2-D array, one standardized sample per row and one
            feature per column.

    Returns:
        A pair ``(eigen_val, eigen_vec)``: ``eigen_val`` is sorted in
        descending order and ``eigen_vec[:, k]`` is the eigenvector that
        belongs to ``eigen_val[k]``.
    """
    cov_matrix = np.cov(features_std.T)
    # A covariance matrix is symmetric, so eigh applies. Unlike eig it never
    # returns complex eigenvalues, which keeps the ordering well defined.
    eigen_val, eigen_vec = np.linalg.eigh(cov_matrix)
    order = np.argsort(eigen_val)[::-1]
    return eigen_val[order], eigen_vec[:, order]


def _build_label_lookup(keys, names):
    """Group the rows of the label file by their two-column key.

    Args:
        keys: 2-D array whose first two columns form the lookup key.
        names: 1-D sequence with the label name of each row in *keys*.

    Returns:
        A dict mapping ``(key0, key1)`` to a list of ``(key0, name)`` tuples
        in file order, so duplicate keys keep every matching name.
    """
    lookup = {}
    for key_row, name in zip(keys, names):
        lookup.setdefault((key_row[0], key_row[1]), []).append((key_row[0], name))
    return lookup


def main(data_path='d_1.txt', label_path='f_1.txt', sample_limit=20,
         holdout=5, n_clusters=3):
    """Run a 2-component PCA plot and a k-means clustering over a CSV data set.

    The data file has no header; column 0 and column 1 together form a row
    key, column 1 is also used as the class label, and columns 2+ are the
    features. The label file has no header either; its columns 0 and 1 are the
    same key and column 2 is a label name.

    Steps:
      1. Load both files and keep the first *sample_limit* data rows.
      2. Standardize all but the last *holdout* rows, compute the covariance
         eigen-decomposition, print the cumulative explained-variance ratios
         and the index of the first component exceeding ``threshold_value``.
      3. Project the standardized rows onto the two leading eigenvectors and
         show a scatter plot coloured by class label.
      4. Fit k-means on the raw (unstandardized) kept rows, print the centres
         and labels, print the label names matched to each cluster, and show
         a scatter plot of the first two raw features per cluster.

    Args:
        data_path: Path of the feature CSV file.
        label_path: Path of the CSV file mapping row keys to label names.
        sample_limit: Number of leading data rows to keep (``None`` keeps all).
        holdout: Number of trailing kept rows excluded from the PCA step.
        n_clusters: Number of k-means clusters.

    Raises:
        ValueError: If fewer than two rows remain for the PCA step or the
            data has fewer than two feature columns.
    """
    # ---- load data -------------------------------------------------------
    df_wine = pd.read_csv(data_path, header=None)
    df_wine2 = pd.read_csv(label_path, header=None)

    x = df_wine.iloc[:, 2:].values[:sample_limit]
    y = df_wine.iloc[:, 1].values[:sample_limit]
    z_frame = df_wine.iloc[:, 0:2].values[:sample_limit]
    z_frame_f = df_wine2.iloc[:, 0:2].values
    label_name_f = df_wine2.iloc[:, 2].values
    print("{0} {1}".format(len(x), len(y)))

    n_train = len(x) - holdout
    if n_train < 2:
        raise ValueError(
            "need at least 2 rows after holding out {0}, got {1}".format(holdout, n_train))
    if x.shape[1] < 2:
        raise ValueError("need at least 2 feature columns for a 2-D projection")

    x_train = x[:n_train]
    y_train = y[:n_train]
    print(len(x_train))
    print(x_train)
    print("----------------------------------------")

    # ---- PCA on the standardized training rows ---------------------------
    x_train_std = StandardScaler().fit_transform(x_train)
    print(len(x_train_std))

    eigen_val, eigen_vec = _sorted_eigen(x_train_std)
    print(len(eigen_val))
    print(len(eigen_vec))

    # Explained-variance ratio per component and its running total.
    cum_var_exp = np.cumsum(eigen_val / eigen_val.sum())
    print(cum_var_exp)

    # Index of the first component whose cumulative ratio passes the
    # threshold; -1 when none does.
    index_x0 = next(
        (i for i, ratio in enumerate(cum_var_exp) if ratio > threshold_value), -1)
    print("PCA:", index_x0)

    print(len(eigen_val))  # number of (eigenvalue, eigenvector) pairs
    print("====================================")
    print("---------m----------------")

    # Projection matrix W: the two leading eigenvectors as columns.
    w = eigen_vec[:, :2]
    x_train_pca = x_train_std.dot(w)
    print("-------------------------")

    color = ['r', 'g', 'b']
    marker = ['s', 'x', 'o']
    # Colours and markers are cycled so that every class is drawn even when
    # there are more classes than styles.
    for k, cls in enumerate(np.unique(y_train)):
        mask = y_train == cls
        plt.scatter(x_train_pca[mask, 0], x_train_pca[mask, 1],
                    c=color[k % len(color)], label=cls,
                    marker=marker[k % len(marker)])
    plt.title('Result')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend(loc='lower left')
    plt.show()

    # ---- k-means on the raw kept rows ------------------------------------
    samples = np.asarray(x)
    # n_init is pinned so the number of restarts does not depend on the
    # installed scikit-learn version.
    estimator = KMeans(n_clusters=n_clusters, n_init=10)
    estimator.fit(samples)
    label_pred = estimator.labels_
    center_p = estimator.cluster_centers_
    print("============聚类中心================")
    print(center_p)
    print("============================")
    print(label_pred)

    # Collect, per cluster, the label names whose key matches each sample.
    lookup = _build_label_lookup(z_frame_f, label_name_f)
    names_by_cluster = [[] for _ in range(n_clusters)]
    for key_row, cluster in zip(z_frame, label_pred):
        for f_key, name in lookup.get((key_row[0], key_row[1]), ()):
            print("{0} {1} {2} {3}".format(cluster + 1, f_key, key_row[1], name))
            names_by_cluster[cluster].append(name)

    print("=========================================")
    for k, names in enumerate(names_by_cluster):
        header = "===={0}===".format(k + 1)
        print("\n" + header if k == 0 else header)
        print(names)

    print("\n\n\n\n\n============================")
    print(label_pred)
    print(y_train)
    print("============================")

    cluster_color = ["red", "green", "blue"]
    cluster_marker = ['o', '*', '+']
    for k in range(n_clusters):
        # Boolean-mask selection keeps the array 2-D even for an empty
        # cluster, so the column indexing below cannot fail.
        members = samples[label_pred == k]
        plt.scatter(members[:, 0], members[:, 1],
                    c=cluster_color[k % len(cluster_color)],
                    marker=cluster_marker[k % len(cluster_marker)],
                    label='label{0}'.format(k))
    plt.legend(loc=2)
    plt.show()
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()