接上一篇博文的环境,使用的是官网的专利使用数据,这里只截取了一部分:
3858241,956203
3858241,1324234
3858241,3398406
3858241,3557384
3858241,3634889
3858242,1515701
3858242,3319261
3858242,3668705
3858242,3707004
3858243,2949611
3858243,3146465
3858243,3156927
3858243,3221341
3858243,3574238
3858243,3681785
3858243,3684611
3858244,14040
3858244,17445

mapper.py
#!/usr/bin/env python
"""mapper.py -- pass-through mapper.

Reads text records from standard input, removes leading and trailing
whitespace from each one, and writes it to standard output, one record per
line.  The records are not interpreted here.

The script runs unchanged on Python 2 and Python 3.
"""
import sys


def map_lines(lines):
    """Yield every item of *lines* with surrounding whitespace removed.

    *lines* is any iterable of strings (for example ``sys.stdin``).  Blank
    lines are not filtered out; they are yielded as empty strings.
    """
    for line in lines:
        yield line.strip()


def main():
    """Copy the cleaned-up records from stdin to stdout."""
    # Emit each record as soon as it is read instead of collecting the
    # whole input in a list first, so memory use stays constant.
    write = sys.stdout.write
    for record in map_lines(sys.stdin):
        write(record + "\n")


if __name__ == "__main__":
    main()

# ---- next listing: reducer.py ----
#!/usr/bin/env python
"""reducer.py -- group the second field of ``key,value`` records by key.

Reads records of the form ``key,value`` from standard input, where *value*
is an integer that may be followed by a tab and further text.  It collects
all values that share a key, sorts the values of each key numerically,
sorts the keys, and prints one ``(key, [values...])`` tuple per line.

The script runs unchanged on Python 2 and Python 3.
"""
from collections import defaultdict
from operator import itemgetter
import sys


def parse_record(line):
    """Turn one input line into a ``(key, value)`` pair.

    Returns ``None`` for lines that carry no record: blank lines and lines
    whose first character is a tab.  Raises ``ValueError`` when the line has
    no comma or when the second field is not an integer.
    """
    record = line.split("\n", 1)[0]
    # Lines starting with a tab are ignored; presumably they have an empty
    # key (e.g. a blank mapper record) -- NOTE(review): confirm against the
    # framework that feeds this script.
    if not record or record.startswith("\t"):
        return None

    fields = record.split(",")
    if len(fields) < 2:
        raise ValueError("expected 'key,value' but got %r" % (record,))

    # Only the text before the first tab is part of the integer value.
    return fields[0], int(fields[1].split("\t")[0])


def group_citations(lines):
    """Return ``[(key, sorted_values), ...]`` ordered by key.

    *lines* is any iterable of strings.  Keys stay strings and are therefore
    ordered as strings, not as numbers; the values are integers sorted in
    ascending order.
    """
    grouped = defaultdict(list)
    for line in lines:
        parsed = parse_record(line)
        if parsed is None:
            continue
        key, value = parsed
        grouped[key].append(value)

    # Sort the values inside each key.
    for values in grouped.values():
        values.sort()

    # Sort the keys.
    return sorted(grouped.items(), key=itemgetter(0))


def main():
    """Group the records read from stdin and print one tuple per line."""
    for pair in group_citations(sys.stdin):
        # %r of the tuple gives the same text as printing the tuple itself.
        sys.stdout.write("%r\n" % (pair,))


if __name__ == "__main__":
    main()

# ---- job output follows ----
[hadoop@mylab-client py]$ hadoop fs -cat out/part-00000
('3858241', [956203, 1324234, 3398406, 3557384, 3634889])
('3858242', [1515701, 3319261, 3668705, 3707004])
('3858243', [2949611, 3146465, 3156927, 3221341, 3574238, 3681785, 3684611])
('3858244', [14040, 17445, 2211676])
[hadoop@mylab-client py]$

再进行格式化输出即可。