import pandas as pd

inputfile = '../data/consumption_data.xls'   # input data file
outputfile = '../tmp/data_type.xls'          # output file for the labelled data
k = 3                                        # number of clusters
iteration = 500                              # maximum number of K-Means iterations

data = pd.read_excel(inputfile, index_col='Id')
data_zs = 1.0 * (data - data.mean()) / data.std()   # z-score standardization

from sklearn.cluster import KMeans
model = KMeans(n_clusters=k, n_jobs=4, max_iter=iteration)
model.fit(data_zs)

# Count the samples per cluster and collect the cluster centers
r1 = pd.Series(model.labels_).value_counts()
r2 = pd.DataFrame(model.cluster_centers_)
r = pd.concat([r2, r1], axis=1)
r.columns = list(data.columns) + [u'类别数目']   # last column: cluster size
print(r)

# Attach each sample's cluster label to the original data and save it
r = pd.concat([data, pd.Series(model.labels_, index=data.index)], axis=1)
r.columns = list(data.columns) + [u'聚类类别']   # last column: cluster label
r.to_excel(outputfile)

def density_plot(data):
    """Plot a kernel density estimate for every feature of one cluster."""
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']   # render Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False     # render minus signs correctly
    p = data.plot(kind='kde', linewidth=2, subplots=True, sharex=False)
    [p[i].set_ylabel(u'密度') for i in range(k)]   # y label: density
    plt.legend()
    return plt

pic_output = '../tmp/pd_'   # filename prefix for the density plots
for i in range(k):
    density_plot(data[r[u'聚类类别'] == i]).savefig(u'%s%s.png' % (pic_output, i))
This is the code for a very common textbook example, but when I run it, it keeps throwing errors. The error message is as follows:
ImportError: [joblib] Attempting to do parallel computing without protecting your import on a system that does not support forking. To use parallel-computing in a script, you must protect your main loop using "if __name__ == '__main__'". Please see the joblib documentation on Parallel for more information
Roughly speaking, this is a parallel-computing error: the example passes n_jobs=4 when building the model, and that is what triggers it. I was not entirely sure at first why the error occurs; judging from the message, it arises because the system does not support forking (typically Windows), so joblib requires the code that starts parallel work to be protected by an "if __name__ == '__main__'" guard, rather than being some library incompatibility introduced by Python 3.6 itself.
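For reference, a minimal sketch of the guard that the error message asks for is shown below, assuming a scikit-learn version in which KMeans still accepts the n_jobs argument (as in the example above). The idea is that only the main process executes the clustering, so when joblib's worker processes re-import the script they do not re-run it.

import pandas as pd
from sklearn.cluster import KMeans

inputfile = '../data/consumption_data.xls'
k = 3
iteration = 500

if __name__ == '__main__':
    # Runs only in the main process; worker processes merely import the module.
    data = pd.read_excel(inputfile, index_col='Id')
    data_zs = 1.0 * (data - data.mean()) / data.std()
    model = KMeans(n_clusters=k, n_jobs=4, max_iter=iteration)
    model.fit(data_zs)
    print(pd.Series(model.labels_).value_counts())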
Solution:
Just remove n_jobs=4 from the KMeans call and the script runs fine.
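Concretely, only the model-building line in the example changes (a sketch; everything else stays the same), at the cost of fitting on a single core:

model = KMeans(n_clusters=k, max_iter=iteration)   # no n_jobs: KMeans runs serially, so joblib never spawns worker processes
model.fit(data_zs)

If the parallel speed-up matters, the other option is to keep n_jobs=4 and wrap the main code in the "if __name__ == '__main__'" guard shown above, as the error message suggests.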