可以使用Python中的pandas和matplotlib库来完成这些任务。下面是一个示例代码:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
# 1. 读取数据并绘制散点图
data = pd.read_csv('moon.csv')
plt.scatter(data['X1'], data['X2'], c=data['Label'])
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
# 2. 使用K-means进行聚类并生成新标签
kmeans = KMeans(n_clusters=2)
data['NewLabel'] = kmeans.fit_predict(data[['X1', 'X2']])
# 统计正确分类的比例
correct_count = sum(data['NewLabel'] == data['Label'])
accuracy = correct_count / len(data)
# 计算轮廓系数
silhouette_avg = silhouette_score(data[['X1', 'X2']], data['NewLabel'])
print("K-means - 正确分类比例:", accuracy)
print("K-means - 轮廓系数:", silhouette_avg)
# 3. 使用层次聚类进行聚类并生成新标签
agg_cluster = AgglomerativeClustering(n_clusters=2)
data['AggLabel'] = agg_cluster.fit_predict(data[['X1', 'X2']])
# 统计正确分类的比例
correct_count_agg = sum(data['AggLabel'] == data['Label'])
accuracy_agg = correct_count_agg / len(data)
# 计算轮廓系数
silhouette_avg_agg = silhouette_score(data[['X1', 'X2']], data['AggLabel'])
print("层次聚类 - 正确分类比例:", accuracy_agg)
print("层次聚类 - 轮廓系数:", silhouette_avg_agg)
# 4. 使用DBSCAN进行聚类并生成新标签
dbscan = DBSCAN(eps=0.3, min_samples=5)
data['DBScanLabel'] = dbscan.fit_predict(data[['X1', 'X2']])
# 统计正确分类的比例(-1表示噪音点,不参与统计)
correct_count_dbscan = sum((data['DBScanLabel'] != -1) & (data['DBScanLabel'] == data['Label']))
accuracy_dbscan = correct_count_dbscan / len(data[data['DBScanLabel'] != -1])
# 计算轮廓系数(-1表示噪音点,不参与计算)
silhouette_avg_dbscan = silhouette_score(data[['X1', 'X2']], data[data['DBScanLabel'] != -1]['DBScanLabel'])
print("DBSCAN - 正确分类比例:", accuracy_dbscan)
print("DBSCAN - 轮廓系数:", silhouette_avg_dbscan)
# 将新标签写入原始CSV文件
data.to_csv('moon.csv', index=False)
请注意,代码中的参数值可以根据数据集和需求进行调整。同时,在使用层次聚类和DBSCAN算法时,也需要根据具体情况设置合适的参数。
内容由零声教学AI助手提供,问题来源于学员提问