4-3 数据离散化(无error版本)

xiaoxiao2021-02-27  554

# -*- coding: utf-8 -*-
"""Discretise one numeric column three ways and plot each result.

The three methods are equal-width binning, equal-frequency binning and
binning at the midpoints between sorted one-dimensional k-means centres.
Every method labels its bins 0 .. k-1.
"""
import pandas as pd


def equal_width(data, k):
    """Cut *data* into *k* equal-width bins labelled 0 .. k-1."""
    return pd.cut(data, k, labels=list(range(k)))


def equal_frequency(data, k):
    """Cut *data* at its i/k quantiles (i = 0 .. k), labelled 0 .. k-1.

    The edges come from ``Series.quantile`` rather than from slicing the
    output of ``describe()`` by position, so the result does not depend on
    the row layout of ``describe()``.
    """
    quantiles = [1.0 * i / k for i in range(k + 1)]
    edges = data.quantile(quantiles).tolist()
    # include_lowest keeps the minimum sample in bin 0; scaling the first
    # edge by (1 - 1e-10) only achieves that when the minimum is positive.
    return pd.cut(data, edges, labels=list(range(k)), include_lowest=True)


def cluster_edges(centers, data):
    """Return bin edges built from cluster centres.

    *centers* is a 2-D, single-column collection of centres (the layout of
    ``KMeans.cluster_centers_`` for one feature).  The inner edges are the
    midpoints of adjacent sorted centres; the outer edges are
    ``min(0, data.min())`` and ``data.max()``.
    """
    ordered = pd.DataFrame(centers).sort_values(0)
    # Rolling mean over a window of 2 gives the midpoint of each adjacent
    # pair; the first row has no predecessor and is NaN, so it is dropped.
    midpoints = ordered.rolling(2).mean().iloc[1:][0].tolist()
    lower = min(0.0, float(data.min()))
    return [lower] + midpoints + [float(data.max())]


def kmeans_discretize(data, k):
    """Cluster *data* into *k* groups and bin it at the centre midpoints."""
    # Imported here so the module can be loaded without scikit-learn.
    from sklearn.cluster import KMeans

    # KMeans no longer accepts an ``n_jobs`` argument, so none is passed.
    model = KMeans(n_clusters=k)
    model.fit(data.values.reshape((len(data), 1)))
    edges = cluster_edges(model.cluster_centers_, data)
    return pd.cut(data, edges, labels=list(range(k)), include_lowest=True)


def cluster_plot(d, k, data):
    """Plot the samples of *data* on one horizontal row per bin label.

    *d* holds the bin label of every sample in *data*.  The configured
    ``matplotlib.pyplot`` module is returned so the caller can ``show()`` it.
    """
    import matplotlib.pyplot as plt

    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign
    plt.figure(figsize=(8, 3))
    for j in range(k):
        members = data[d == j]
        plt.plot(members, [j] * len(members), 'o')
    plt.ylim(-0.5, k - 0.5)
    return plt


def main():
    """Read the data set, discretise it three ways and show three plots."""
    datafile = '../data/discretization_data.xls'
    data = pd.read_excel(datafile)
    data = data[u'肝气郁结证型系数'].copy()
    k = 4

    d1 = equal_width(data, k)
    d2 = equal_frequency(data, k)
    d3 = kmeans_discretize(data, k)

    for labels in (d1, d2, d3):
        cluster_plot(labels, k, data).show()


if __name__ == "__main__":
    main()

运行环境为 Python 3.5,数据集(discretization_data.xls)内容如下:

肝气郁结证型系数0.0560.4880.1070.3220.2420.3890.2460.3300.2570.2050.3300.2350.2670.2810.1840.2710.1000.1730.3020.1760.1720.1950.2810.2450.1560.1680.2110.2550.2790.3410.2300.2660.2520.2270.2770.3290.3200.0530.1520.2690.0420.1790.2390.1670.2090.4320.3540.2470.3280.2150.4330.2940.2370.2440.1990.2860.2190.2610.3160.2210.3260.2840.2420.2680.2570.1740.2510.2370.2980.2880.2630.2730.2330.2430.2990.2020.2290.3480.3690.1860.2700.3020.2160.3200.1670.2280.1080.2530.3030.2740.2250.1800.1950.2190.1690.2340.2790.1140.2250.1690.1750.1700.2230.1660.2170.2560.2520.2990.2470.3440.2720.1590.2190.2400.1870.2780.2270.2800.2850.3880.1340.2300.1950.2200.4480.3230.1040.1020.1960.1760.2290.1990.2780.2410.2070.1630.2530.2220.2770.2900.2110.1860.4180.1750.2380.1600.1980.3290.1800.2420.2870.2160.3180.3080.2110.1290.2850.1390.2180.1770.2090.3860.1370.1930.3020.3500.3230.1020.1980.1100.1730.4170.1230.1090.1110.1660.3030.1280.1570.1340.3140.2880.1070.2950.3000.1710.0570.5040.1020.3160.2510.3830.2430.3230.2710.2000.3460.2510.2660.2950.2000.2670.1000.1840.2940.1740.1880.1930.2910.2640.1680.1690.2080.2670.2950.3460.2350.2690.2590.2280.2870.3390.3240.0540.1590.2730.0550.1810.2380.1640.2100.4320.3630.2470.3300.2110.4270.2960.2470.2570.2120.2850.2160.2710.3280.2190.3320.2940.2460.2820.2720.1730.2500.2420.3130.2960.2740.2810.2370.2490.3030.1930.2270.3470.3730.1970.2760.3040.2350.3250.1660.2260.1150.2530.3060.2770.2230.1760.2050.2380.1730.2430.2920.1220.2240.1770.1760.1830.2290.1580.2210.2500.2520.3080.2470.3550.2820.1550.2260.2470.1900.2820.2280.2880.2960.3890.1400.2310.1980.2350.4560.3220.1080.1100.2050.1790.2450.2060.2870.2540.2150.1550.2540.2270.2800.3050.2140.1880.4200.1770.2540.1690.1930.3450.1740.2530.2890.2320.3240.3180.2160.1420.2810.1440.2170.1830.2170.4000.1470.1910.3090.3450.3190.1100.2100.1150.1850.4350.1380.1190.1110.1730.3100.1320.1680.1340.3290.3030.1040.3000.2980.1870.0340.4740.0850.3040.2310.3770.2260.3040.2470.1840.3250.2180.2580.2730.1810.2630.0830.1690.2880.1580.1610.1860.2580.2330.1460.1490.
1950.2470.2750.3260.2130.2510.2330.2090.2530.3080.3030.0420.1350.2530.0260.1580.2290.1580.1980.4110.3370.2260.3120.1910.4170.2830.2190.2390.1910.2750.2040.2470.3120.2080.3180.2700.2290.2510.2500.1550.2280.2190.2930.2700.2430.2650.2140.2300.2770.1820.2150.3410.3660.1700.2510.2740.2160.3050.1610.2120.0910.2350.2790.2500.1990.1690.1860.2100.1490.2140.2540.1060.2130.1570.1660.1630.2020.1440.2070.2400.2320.2900.2260.3280.2450.1400.1980.2320.1660.2570.2020.2660.2660.3660.1260.2060.1750.2150.4290.2980.0940.0870.1800.1550.2160.1750.2670.2330.1930.1510.2380.1980.2580.2700.1880.1730.3970.1660.2170.1450.1780.3260.1730.2390.2770.2160.2980.2940.1900.1220.2730.1250.2030.1620.2040.3680.1180.1650.2890.3240.3120.0920.1950.0890.1610.4000.1180.1010.0950.1640.2880.1240.1470.1220.3060.2840.0870.2720.2860.1510.0450.4870.0890.3040.2320.3750.2290.3170.2480.1930.3170.2260.2540.2770.1840.2630.0890.1690.2900.1650.1630.1800.2680.2400.1500.1540.2050.2520.2700.3280.2150.2540.2430.2110.2690.3190.3080.0450.1440.2620.0330.1730.2290.1580.2020.4170.3420.2360.3170.2060.4240.2790.2330.2410.1930.2760.2070.2520.3040.2160.3140.2640.2280.2630.2490.1630.2420.2230.2890.2780.2600.2600.2240.2320.2920.1910.2120.3330.3620.1730.2630.2910.2140.3120.1560.2100.1050.2380.2890.2640.2080.1650.1790.2180.1550.2280.2660.1050.2100.1630.1590.1630.2140.1510.2150.2380.2450.2920.2430.3370.2620.1470.2110.2340.1770.2720.2100.2650.2690.3750.1250.2180.1830.2120.4320.3110.0940.0920.1880.1570.2170.1910.2700.2380.1980.1440.2370.2090.2710.2850.2060.1790.4110.1650.2320.1430.1840.3260.1660.2330.2750.2160.3090.2990.1970.1200.2680.1290.2110.1660.2000.3830.1340.1750.2920.3370.3140.0930.1950.0970.1620.4080.1170.0990.0980.1620.2940.1260.1450.1300.3040.2830.1030.2830.2870.1610.0490.4880.0980.3130.2370.3810.2390.3230.2540.1950.3260.2330.2590.2770.1840.2660.0980.1710.2930.1710.1710.1880.2760.2440.1530.1640.2080.2530.2770.3360.2210.2560.2470.2170.2730.3270.3170.0490.1460.2680.0410.1770.2320.1630.2020.4260.3440.2430.3220.2070.4250.2840.2360.2420.
1930.2830.2130.2560.3130.2170.3230.2740.2360.2640.2540.1670.2460.2330.2930.2850.2630.2670.2250.2370.2940.1920.2190.3410.3660.1780.2640.2930.2160.3170.1630.2190.1060.2470.2970.2680.2170.1720.1880.2180.1630.2310.2720.1070.2170.1660.1670.1660.2170.1560.2170.2480.2470.2930.2440.3410.2640.1530.2160.2360.1830.2770.220.2730.2770.3830.1330.2260.1930.2180.4390.3170.0980.0980.1920.1660.2270.1940.2780.2380.2050.1530.2470.2160.2730.2890.2070.1830.4150.1710.2370.1520.1920.3270.1730.2390.2840.2160.3110.3020.2060.1270.2770.1350.2140.1730.2050.3840.1340.1840.2940.3410.3170.0980.1950.1020.1720.4150.1220.1060.1070.1650.3020.1270.1520.1310.3110.2840.1030.2870.2960.169

转载请注明原文地址: https://www.6miu.com/read-145.html

最新回复(0)