NLTK07 《Python自然语言处理》 (Natural Language Processing with Python) code06: Learning to Classify Text

xiaoxiao, 2021-02-28

Learning to Classify Text

# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# 《Python自然语言处理》 (Natural Language Processing with Python) 06 Learning to Classify Text
# pnlp06.py -- several of the training runs in this section are slow; expect 60+ minutes in total

# 6.1 Supervised classification
def gender_features(word):
    return {'last_letter': word[-1]}

res = gender_features('Shrek')
print(res)  # {'last_letter': 'k'}

from nltk.corpus import names
import random
import nltk

# use a distinct name so the `names` corpus import is not shadowed
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)
featuresets = [(gender_features(n), g) for (n, g) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
res = classifier.classify(gender_features('Neo'))
print(res)  # male
res = classifier.classify(gender_features('Trinity'))
print(res)  # female
res = nltk.classify.accuracy(classifier, test_set)
print(res)  # 0.73
classifier.show_most_informative_features(5)
# Most Informative Features
#     last_letter = 'a'    female : male   = 38.1 : 1.0
#     last_letter = 'k'    male   : female = 30.9 : 1.0
#     last_letter = 'f'    male   : female = 17.4 : 1.0
#     last_letter = 'p'    male   : female = 11.9 : 1.0
#     last_letter = 'v'    male   : female = 10.6 : 1.0

# apply_features builds a lazy sequence, so the full feature sets never sit in memory at once
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

# Choosing the right features
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

res = gender_features2('John')
print(res)  # {'firstletter': 'j', 'lastletter': 'n', 'count(a)': 0, ...

featuresets = [(gender_features2(n), g) for (n, g) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
res = nltk.classify.accuracy(classifier, test_set)
print(res)  # 0.762

# Training set, dev-test set, and test set
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]
train_set = [(gender_features(n), g) for (n, g) in train_names]
devtest_set = [(gender_features(n), g) for (n, g) in devtest_names]
test_set = [(gender_features(n), g) for (n, g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
res = nltk.classify.accuracy(classifier, devtest_set)
print(res)  # 0.753

# Error analysis on the dev-test set
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
for (tag, guess, name) in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
# correct=female guess=male name=Abigael
# correct=female guess=male name=Adriaens
# ...

# Refined features suggested by the error analysis: one- and two-letter suffixes
def gender_features(word):
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}

train_set = [(gender_features(n), g) for (n, g) in train_names]
devtest_set = [(gender_features(n), g) for (n, g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
res = nltk.classify.accuracy(classifier, devtest_set)
print(res)  # 0.771

# Document classification
import random, nltk
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# take the 2,000 most frequent words as features
word_features = [w for (w, _) in all_words.most_common(2000)]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

res = document_features(movie_reviews.words('pos/cv957_8737.txt'))
print(res)  # {'contains(plot)': True, 'contains(:)': True, ...

featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
res = nltk.classify.accuracy(classifier, test_set)
print(res)  # e.g. 0.79 (the exact value depends on the random shuffle)
classifier.show_most_informative_features(5)
# Most Informative Features
#     contains(martian) = True        neg : pos = 7.7 : 1.0
#     contains(atrocious) = True      neg : pos = 7.1 : 1.0
#     contains(unimaginative) = True  neg : pos = 7.1 : 1.0
#     contains(turkey) = True         neg : pos = 6.8 : 1.0
#     contains(schumacher) = True     neg : pos = 6.7 : 1.0

# Part-of-speech tagging
import nltk
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
# take the 100 most common suffixes as features
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)  # e.g. ['e', ',', '.', 's', 'd', 't', ...

def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features

tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)  # slow: this is one of the 60+ minute steps
res = nltk.classify.accuracy(classifier, test_set)
print(res)  # e.g. 0.5689706613625062 (depends on the chosen suffixes)
res = classifier.classify(pos_features('cats'))
print(res)  # NNS
print(classifier.pseudocode(depth=4))
"""
if endswith(the) == False:
  if endswith(,) == False:
    if endswith(s) == False:
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True:
      if endswith(was) == False: return 'PP$'
      if endswith(was) == True: return 'BEDZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'
"""

# Exploiting context
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

res = pos_features(brown.sents()[0], 8)
print(res)  # {'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, i), tag))
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
res = nltk.classify.accuracy(classifier, test_set)
print(res)  # 0.7891596220785678

# Sequence classification
def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]  # the tag already assigned to the previous word
    return features

import nltk
from nltk.corpus import brown

class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))  # a list, not a zip iterator, so evaluate() can concatenate results

tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
res = tagger.evaluate(test_sents)
print(res)  # 0.7965693092257765

# Other methods for sequence classification
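The book closes this section by mentioning alternative sequence models, including hidden Markov models. As a hedged sketch (not part of the original post), the lines below train NLTK's built-in HMM tagger on the same Brown split used for ConsecutivePosTagger above; the Lidstone estimator is my assumption, used to smooth away zero probabilities for unseen words, and the resulting accuracy is not a figure reported anywhere in this post.

from nltk.tag import hmm
from nltk.probability import LidstoneProbDist

# smooth unseen (word, tag) events so decoding never hits a zero probability
est = lambda fdist, bins: LidstoneProbDist(fdist, 0.1, bins)
hmm_tagger = hmm.HiddenMarkovModelTrainer().train_supervised(train_sents, estimator=est)
print(hmm_tagger.evaluate(test_sents))  # accuracy varies; no value reported in the original post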
# 6.2 Further examples of supervised classification
# Sentence segmentation
import nltk
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)

def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prevword': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens) - 1)
               if tokens[i] in '.?!']
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
res = nltk.classify.accuracy(classifier, test_set)
print(res)  # 0.936026936026936

def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])
    return sents

# Identifying dialogue act types
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains(%s)' % word.lower()] = True
    return features

featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
res = nltk.classify.accuracy(classifier, test_set)
print(res)  # 0.668

# Recognizing textual entailment (RTE)
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
print(extractor.text_words)  # {'four', 'republics', 'association', 'Iran', 'Co', ...
print(extractor.hyp_words)  # {'SCO.', 'China', 'member'}
print(extractor.overlap('word'))  # set()
print(extractor.overlap('ne'))  # {'China'}
print(extractor.hyp_extra('word'))  # {'member'}

# Scaling up to larger datasets
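The book notes here that NLTK's pure-Python classifiers become slow on large datasets and points to external machine-learning libraries. As a hedged sketch (not part of the original post, and assuming scikit-learn is installed), NLTK's SklearnClassifier wrapper trains a scikit-learn model on the same (featureset, label) pairs, reusing the dialogue-act train_set/test_set from above.

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB

# the wrapper converts NLTK featureset dicts into scikit-learn's input format
sk_classifier = SklearnClassifier(BernoulliNB()).train(train_set)
print(nltk.classify.accuracy(sk_classifier, test_set))  # no value reported in the original post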
# 6.3 Evaluation
# The test set
import random, nltk
from nltk.corpus import brown
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

# a better split keeps whole files apart, so train and test sentences never share a document
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

# a stricter split tests on a different genre altogether
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

# Accuracy
# classifier = nltk.NaiveBayesClassifier.train(train_set)
# print('Accuracy: %4.2f' % nltk.classify.accuracy(classifier, test_set))

# Precision and recall
# Precision: how many of the items we found are relevant
# Recall: how many of the relevant items we found
# F-measure (F-score): (2 * Precision * Recall) / (Precision + Recall)
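The three definitions above map directly onto nltk.metrics, which compares, per label, the set of item IDs in the reference with the set of item IDs the classifier predicted. The sketch below is mine, not the original post's, and assumes a trained classifier with a (featureset, label) test_set and 'pos'/'neg' labels, such as the movie-review example in 6.1.

import collections
from nltk.metrics import precision, recall, f_measure

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)                        # ground truth: item i belongs to `label`
    testsets[classifier.classify(feats)].add(i)  # prediction: item i was labeled this way
print('precision:', precision(refsets['pos'], testsets['pos']))
print('recall:   ', recall(refsets['pos'], testsets['pos']))
print('f-measure:', f_measure(refsets['pos'], testsets['pos']))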
# Confusion matrices
def tag_list(tagged_sents):
    return [tag for sent in tagged_sents for (word, tag) in sent]

def apply_tagger(tagger, corpus):
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

sents = brown.tagged_sents(categories='editorial')
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(sents, backoff=t0)
t2 = nltk.BigramTagger(sents, backoff=t1)
gold = tag_list(sents)
test = tag_list(apply_tagger(t2, sents))
cm = nltk.ConfusionMatrix(gold, test)
print(cm)

# Cross-validation (a k-fold sketch is appended at the end of this listing)

# 6.4 Decision trees
# Entropy and information gain
import math
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum([p * math.log(p, 2) for p in probs])

print(entropy(['male', 'male', 'male', 'male']))          # -0.0
print(entropy(['male', 'female', 'male', 'male']))        # 0.8112781244591328
print(entropy(['female', 'male', 'female', 'male']))      # 1.0
print(entropy(['female', 'female', 'male', 'female']))    # 0.8112781244591328
print(entropy(['female', 'female', 'female', 'female']))  # -0.0

# 6.5 Naive Bayes classifiers
# The underlying probabilistic model
# Zero counts and smoothing
# Non-binary features
# The naivety of independence
# The cause of double-counting

# 6.6 Maximum entropy classifiers (a hedged training sketch is appended at the end of this listing)

# 6.7 Modeling linguistic patterns
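The cross-validation heading in 6.3 carries no code in the original post. Below is a minimal k-fold sketch of the usual procedure: partition the featuresets into k folds, train on k-1 of them, test on the held-out fold, and average the k accuracy scores. The helper name cross_validate is mine.

def cross_validate(featuresets, k=10):
    fold_size = len(featuresets) // k
    accuracies = []
    for f in range(k):
        test_fold = featuresets[f * fold_size:(f + 1) * fold_size]
        train_fold = featuresets[:f * fold_size] + featuresets[(f + 1) * fold_size:]
        fold_classifier = nltk.NaiveBayesClassifier.train(train_fold)
        accuracies.append(nltk.classify.accuracy(fold_classifier, test_fold))
    return sum(accuracies) / k

# e.g. cross_validate(featuresets, k=10) on the movie-review featuresets from 6.1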
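Section 6.6 names the maximum entropy classifier, but the original post includes no code for it. The lines below are a hedged sketch using NLTK's built-in MaxentClassifier (the pure-Python IIS algorithm requires numpy and is slow); they assume a (featureset, label) train_set/test_set such as the gender example in 6.1.

# run only a few IIS iterations, since maxent training is expensive
maxent = nltk.classify.MaxentClassifier.train(train_set, algorithm='iis', max_iter=3, trace=0)
print(nltk.classify.accuracy(maxent, test_set))  # no value reported in the original post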
Please credit the original source when reposting: https://www.6miu.com/read-43426.html
