NLTK 08: Natural Language Processing with Python, code 07: Extracting Information from Text

Extracting Information from Text

# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# Natural Language Processing with Python, Chapter 7: Extracting Information from Text
# pnlp07.py

# 7.1 Information extraction
# Information extraction architecture
import nltk

def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences  # return added so the preprocessing result is actually usable

# 7.2 Chunking
# Noun phrase chunking
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"),
            ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)
"""
(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))
"""
result.draw()

# Tag patterns
# Chunking with regular expressions
grammar = r"""
  NP: {<DT|PP\$>?<JJ>*<NN>}
      {<NNP>+}
"""
cp = nltk.RegexpParser(grammar)  # fixed: the original assigned to "grammer" and so parsed with a stale grammar
sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"), ("her", "PP$"),
            ("long", "JJ"), ("golden", "JJ"), ("hair", "NN")]
print(cp.parse(sentence))
"""
(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))
"""

nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
grammar = "NP: {<NN><NN>}  # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))
"""
(S (NP money/NN market/NN) fund/NN)
"""

# Exploring text corpora
cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents():
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == 'CHUNK':
            print(subtree)
"""
(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
...
"""

# Chinking
grammar = r"""
  NP: {<.*>+}      # Chunk everything
      }<VBD|IN>+{  # Chink sequences of VBD and IN
"""
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"),
            ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
cp = nltk.RegexpParser(grammar)  # fixed: another "grammer"/"grammar" mix-up left the chink grammar unused
print(cp.parse(sentence))
"""
(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))
"""

# Representing chunks: tags vs. trees

# 7.3 Developing and evaluating chunkers
# Reading IOB format and the CoNLL-2000 chunking corpus
# (IOB tags for "accepted" (B-VP) and the first "of" (B-PP) corrected from the transcription)
text = """
he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
"""
nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()

from nltk.corpus import conll2000
print(conll2000.chunked_sents('train.txt')[99])
"""
(S
  (PP Over/IN)
  (NP a/DT cup/NN)
  (PP of/IN)
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  (VP told/VBD)
  (NP his/PRP$ story/NN)
  ./.)
"""
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])
"""
(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)
"""
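# Added sketch (not in the original post): the IOB <-> tree conversions that
# the chunkers below depend on. tree2conlltags flattens a chunk tree into
# (word, pos, iob-tag) triples, and conlltags2tree should invert it exactly
# for these NP-only trees.
tree99 = conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99]
iob = nltk.chunk.tree2conlltags(tree99)
print(iob[:3])  # expected: [('Over', 'IN', 'O'), ('a', 'DT', 'B-NP'), ('cup', 'NN', 'I-NP')]
print(nltk.chunk.conlltags2tree(iob) == tree99)  # expected: True (round-trips back to the same tree)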
""" # 简单评估与基准 from nltk.corpus import conll2000 cp = nltk.RegexpParser("") test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP']) print(cp.evaluate(test_sents)) """ ChunkParse score: IOB Accuracy: 43.4%% Precision: 0.0%% Recall: 0.0%% F-Measure: 0.0%% """ grammar = r"NP: {<CDJNP].*>+}" cp = nltk.RegexpParser(grammar) print(cp.evaluate(test_sents)) """ ChunkParse score: IOB Accuracy: 43.4%% Precision: 0.0%% Recall: 0.0%% F-Measure: 0.0%% """ class UnigramChunker(nltk.ChunkParserI): def __init__(self, train_sents): train_data = [[(t,c) for w, t, c in nltk.chunk.tree2conlltags(sent)] for sent in train_sents] self.tagger = nltk.UnigramTagger(train_data) def parse(self, sentence): pos_tags = [pos for (word, pos) in sentence] tagged_pos_tags = self.tagger.tag(pos_tags) chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags] conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in zip(sentence, chunktags)] return nltk.chunk.tree2conlltags(conlltags) from nltk.corpus import conll2000 test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP']) train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP']) unigram_chunker = UnigramChunker(train_sents) print(unigram_chunker.evaluate(test_sents)) """ ChunkParse score: IOB Accuracy: 43.4%% Precision: 0.0%% Recall: 0.0%% F-Measure: 0.0%% """ postags = sorted(set(pos for sent in train_sents for (word, pos) in sent.leaves())) print(unigram_chunker.tagger.tag(postags)) """[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), ...""" class BigramChunker(nltk.ChunkParserI): def __init__(self, train_sents): train_data = [[(t,c) for w, t, c in nltk.chunk.tree2conlltags(sent)] for sent in train_sents] self.tagger = nltk.UnigramTagger(train_data) def parse(self, sentence): pos_tags = [pos for (word, pos) in sentence] tagged_pos_tags = self.tagger.tag(pos_tags) chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags] conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in zip(sentence, chunktags)] return nltk.chunk.tree2conlltags(conlltags) bigram_chunker = BigramChunker(train_sents) print(bigram_chunker.evaluate(test_sents)) """ ChunkParse score: IOB Accuracy: 43.4%% Precision: 0.0%% Recall: 0.0%% F-Measure: 0.0%% """ # 训练基于分类器的分块器 class ConsecutiveNPChunkTagger(nltk.TaggerI): def __init__(self, train_sents): train_set = [] for tagged_sent in train_sents: untagged_sent = nltk.tag.untag(tagged_sent) history = [] for i, (word, tag) in enumerate(tagged_sent): featureset = npchunk_features(untagged_sent, i, history) train_set.append((featureset, tag)) history.append(tag) # windows 采用megam算法需要自己编译megam模块,比较麻烦,可以尝试使用iis、gis之类的算法 self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='iis', trace=0) def tag(self, sentence): history = [] for i, word in enumerate(sentence): featureset = npchunk_features(sentence, i, history) tag = self.classifier.classify(featureset) history.append(tag) return zip(sentence, history) class ConsecutiveNPChunker(nltk.ChunkParserI): def __init__(self, train_sents): tagged_sents = [[((w,t),c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] for sent in train_sents] self.tagger = ConsecutiveNPChunkTagger(tagged_sents) def parse(self, sentence): tagged_sents = self.tagger.tag(sentence) conlltags = [(w,t,c) for ((w,t),c) in tagged_sents] return nltk.chunk.conlltags2tree(conlltags) # 不同的npchunk_features ''' def npchunk_features(sentence, i, history): word, pos = sentence[i] return {"pos": pos} chunker = ConsecutiveNPChunker(train_sents) # 这个训练比较耗时 
# Different versions of npchunk_features. Everything from here through the
# NER example stays inside a triple-quoted string, as in the original post,
# because repeatedly training the MaxEnt chunker is slow; remove the quotes to run it.
'''
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    return {"pos": pos}

chunker = ConsecutiveNPChunker(train_sents)  # this training is time-consuming
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  92.9%
    Precision:     79.9%
    Recall:        86.8%
    F-Measure:     83.2%
"""

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "prevpos": prevpos}

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  93.6%
    Precision:     82.0%
    Recall:        87.2%
    F-Measure:     84.6%
"""

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "word": word, "prevpos": prevpos}

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  94.6%
    Precision:     84.6%
    Recall:        89.8%
    F-Measure:     87.1%
"""

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    if i == len(sentence)-1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i+1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}

def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  96.0%
    Precision:     88.3%
    Recall:        91.1%
    F-Measure:     89.7%
"""

# 7.4 Recursion in linguistic structure
# Building nested structure with cascaded chunkers
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
"""
cp = nltk.RegexpParser(grammar)
sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
            ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(cp.parse(sentence))
"""
(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
"""
sentence = [("John", "NNP"), ("thinks", "VBZ"), ("Mary", "NN"), ("saw", "VBD"),
            ("the", "DT"), ("cat", "NN"), ("sit", "VB"), ("on", "IN"),
            ("the", "DT"), ("mat", "NN")]
print(cp.parse(sentence))
"""
(S
  (NP John/NNP)
  thinks/VBZ
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
"""
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))
"""
(S
  (NP John/NNP)
  thinks/VBZ
  (CLAUSE
    (NP Mary/NN)
    (VP saw/VBD
        (CLAUSE
          (NP the/DT cat/NN)
          (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))
"""

# Trees
tree1 = nltk.Tree('NP', ['Alice'])
print(tree1)              # (NP Alice)
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
print(tree2)              # (NP the rabbit)
tree3 = nltk.Tree('VP', ['chased', tree2])
print(tree3)              # (VP chased (NP the rabbit))
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4)              # (S (NP Alice) (VP chased (NP the rabbit)))
print(tree4[1])           # (VP chased (NP the rabbit))
print(tree4[1].label())   # VP
print(tree4.leaves())     # ['Alice', 'chased', 'the', 'rabbit']
print(tree4[1][1][1])     # rabbit
tree3.draw()

# Traversing a tree with a recursive function
def traverse(t):
    try:
        t.label()
    except AttributeError:
        print(t, end=" ")   # t is a leaf (a plain string)
    else:
        # t is a Tree, so t.label() is defined
        print("(", t.label(), end=" ")
        for child in t:
            traverse(child)
        print(")", end=" ")

# nltk.Tree('(S ...)') no longer accepts a bracketed string; use Tree.fromstring
t = nltk.Tree.fromstring('(S (NP Alice) (VP chased (NP the rabbit)))')
traverse(t)
# ( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) )
# (The original post used end="", which squashed this to "( S( NPAlice)( VPchased( NPtherabbit)))".)
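# Added sketch (not in the original post, and still inside the disabled
# block): the same leaf-vs-subtree test can drive other recursive tree
# computations, for example the nesting depth. Note NLTK's built-in
# t.height() counts leaves as height 1, so it reports 4 for this tree.
def tree_depth(t):
    try:
        t.label()
    except AttributeError:
        return 0            # a leaf (plain string) adds no depth
    return 1 + max(tree_depth(child) for child in t)

print(tree_depth(t))        # expected: 3 for (S (NP Alice) (VP chased (NP the rabbit)))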
# 7.5 Named entity recognition (NER)
sent = nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent, binary=True))
"""
(S
  The/DT (NE U.S./NNP) is/VBZ one/CD of/IN the/DT few/JJ industrialized/VBN
  nations/NNS that/WDT *T*-7/-NONE- does/VBZ n't/RB have/VB a/DT higher/JJR
  standard/NN of/IN regulation/NN for/IN the/DT smooth/JJ ,/, needle-like/JJ
  fibers/NNS such/JJ as/IN crocidolite/NN that/WDT *T*-1/-NONE- are/VBP
  classified/VBN *-5/-NONE- as/IN amphobiles/NNS ,/, according/VBG to/TO
  (NE Brooke/NNP) T./NNP Mossman/NNP ,/, a/DT professor/NN of/IN pathlogy/NN
  at/IN the/DT (NE University/NNP) of/IN (NE Vermont/NNP College/NNP) of/IN
  (NE Medicine/NNP) ./.)
"""
'''

# 7.6 Relation extraction
import re

IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.relextract.extract_rels('ORG', 'LOC', doc,
                                                corpus='ieer', pattern=IN):
        print(nltk.sem.relextract.rtuple(rel))
"""
[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan & Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
"""

from nltk.corpus import conll2002
vnv = """
(
is/V|    # 3rd person singular present and
was/V|   # past forms of the verb zijn ('to be')
werd/V|  # and the present
wordt/V  # and past forms of the verb worden ('to become')
)
.*       # followed by anything
van/Prep # followed by van ('of')
"""
VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
    for r in nltk.sem.relextract.extract_rels('PER', 'ORG', doc,
                                              corpus='conll2002', pattern=VAN):
        print(nltk.sem.relextract.clause(r, relsym="VAN"))
"""
VAN("cornet_d'elzius", 'buitenlandse_handel')
VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
VAN('annie_lennox', 'eurythmics')
"""
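# Added sketch (not in the original post): what the negative lookahead in the
# IN pattern does. The filler between ORG and LOC must contain the word "in",
# but matches are discarded when "in" is followed by a gerund, e.g.
# "success in supervising the transition of".
print(bool(IN.match('success in supervising the transition')))  # False: "in" precedes a gerund
print(bool(IN.match('a research group based in')))              # True: plain "in"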
When reposting, please credit the original source: https://www.6miu.com/read-43870.html
