NLTK09《Python自然语言处理》code08 分析句子结构

xiaoxiao2021-02-28 112

分析句子结构

# -*- coding: utf-8 -*-
# Environment used by the original author: win10, python3.5.3/python3.6.1, nltk3.2.4
# "Natural Language Processing with Python", chapter 08: Analyzing Sentence Structure
# pnlp08.py
#
# NOTE(review): this listing was recovered from a paste in which every newline
# had been collapsed.  Statement boundaries and the line breaks inside the
# grammar strings were reconstructed from syntax; whitespace *inside* ordinary
# string literals is reproduced exactly as it appeared in the paste.

from collections import defaultdict

import nltk
from nltk.corpus import treebank

# ---------------------------------------------------------------------------
# 8.1 Some grammatical dilemmas
#   - Linguistic data and unlimited possibilities
#   - Ubiquitous ambiguity
# ---------------------------------------------------------------------------

# One production per line: the grammar reader needs the line breaks to tell
# productions apart.
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")

sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)
trees = parser.parse(sent)
for tree in trees:
    print(tree)
# Expected output (two parses, one per PP attachment):
# (S
#   (NP I)
#   (VP
#     (VP (V shot) (NP (Det an) (N elephant)))
#     (PP (P in) (NP (Det my) (N pajamas)))))
# (S
#   (NP I)
#   (VP
#     (V shot)
#     (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))

# ---------------------------------------------------------------------------
# 8.2 What is the use of grammar?
# 8.3 Context-free grammar
# ---------------------------------------------------------------------------

grammar1 = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V NP PP
PP -> P NP
V -> "saw" | "ate" | "walked"
NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
Det -> "a" | "an" | "the" | "my"
N -> "man" | "dog" | "cat" | "telescope" | "park"
P -> "in" | "on" | "by" | "with"
""")

sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)
# Expected output:
# (S (NP Mary) (VP (V saw) (NP Bob)))

# Writing your own grammar.
# Loads the grammar from a file named mygrammar.cfg via a "file:" URL; the
# file is not part of this listing and must be supplied separately.
grammar1 = nltk.data.load('file:mygrammar.cfg')
sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)
# Expected output:
# (S (NP Mary) (VP (V saw) (NP Bob)))

# Example 8-2: a recursive context-free grammar
# (Nom -> Adj Nom and VP -> V S are the recursive productions).
grammar2 = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det Nom | PropN
Nom -> Adj Nom | N
VP -> V Adj | V NP | V S | V NP PP
PP -> P NP
PropN -> 'Buster' | 'Chatterer' | 'Joe'
Det -> 'the' | 'a'
N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
Adj -> 'angry' | 'frightened' | 'little' | 'tall'
V -> 'chased' | 'saw' | 'said' | 'thought' | 'was' | 'put'
P -> 'on'
""")

# ---------------------------------------------------------------------------
# 8.4 Parsing with context-free grammar
#   - Recursive descent parsing
#   - Shift-reduce parsing
#   - The left-corner parser
#   - Well-formed substring tables
# ---------------------------------------------------------------------------


# Example 8-3: acceptor using a well-formed substring table (WFST)
def init_wfst(tokens, grammar):
    """Build a WFST whose diagonal holds one category per token.

    Args:
        tokens: sequence of word tokens.
        grammar: object exposing ``productions(rhs=token)``; each returned
            production must expose ``lhs()``.

    Returns:
        A ``(len(tokens) + 1)`` x ``(len(tokens) + 1)`` list of lists filled
        with ``None`` except ``wfst[i][i + 1]``, which is the left-hand side
        of the first production returned for ``tokens[i]``.

    Raises:
        IndexError: if ``grammar.productions(rhs=token)`` returns an empty
            list for some token.
    """
    numtokens = len(tokens)
    wfst = [[None for _ in range(numtokens + 1)] for _ in range(numtokens + 1)]
    for i, token in enumerate(tokens):
        productions = grammar.productions(rhs=token)
        # Only the first matching production is recorded for the cell.
        wfst[i][i + 1] = productions[0].lhs()
    return wfst


def complete_wfst(wfst, tokens, grammar, trace=False):
    """Fill in the cells of ``wfst`` for spans of two or more tokens.

    The table is modified in place and also returned.

    Args:
        wfst: table produced by ``init_wfst``.
        tokens: the token sequence the table was built for.
        grammar: object exposing ``productions()``; each production must
            expose ``rhs()`` and ``lhs()``.
        trace: when true, print one line for every cell that gets filled.

    Returns:
        The same ``wfst`` object that was passed in.
    """
    # Reverse lookup from right-hand side to left-hand side.  When several
    # productions share a right-hand side, the last one wins.
    index = {p.rhs(): p.lhs() for p in grammar.productions()}
    numtokens = len(tokens)
    for span in range(2, numtokens + 1):
        # Every start position whose span still ends inside the sentence.
        for start in range(numtokens + 1 - span):
            end = start + span
            for mid in range(start + 1, end):
                nt1, nt2 = wfst[start][mid], wfst[mid][end]
                if nt1 and nt2 and (nt1, nt2) in index:
                    wfst[start][end] = index[(nt1, nt2)]
                    if trace:
                        print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]" %
                              (start, nt1, mid, nt2, end,
                               start, index[(nt1, nt2)], end))
    return wfst


def display(wfst, tokens):
    """Print ``wfst`` as a grid, showing ``.`` for empty cells.

    ``tokens`` is accepted but not used by the body.
    """
    # NOTE(review): the spacing inside these format strings is reproduced as
    # found in the collapsed paste; padding spaces may have been lost there,
    # so check column alignment against a trusted copy of the listing.
    print('\nWFST ' + ' '.join([("%-4d" % i) for i in range(1, len(wfst))]))
    for i in range(len(wfst) - 1):
        print("%d " % i, end="")
        for j in range(1, len(wfst)):
            print("%-4s" % (wfst[i][j] or '.'), end="")
        print("")


tokens = "I shot an elephant in my pajamas".split()
wfst0 = init_wfst(tokens, groucho_grammar)
display(wfst0, tokens)
# Expected output (column spacing not preserved in this copy):
# WFST 1 2 3 4 5 6 7
# 0 NP . . . . . .
# 1 . V . . . . .
# 2 . . Det . . . .
# 3 . . . N . . .
# 4 . . . . P . .
# 5 . . . . . Det .
# 6 . . . . . . N

# complete_wfst fills the table in place, so wfst1 is the same object as wfst0.
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar)
display(wfst1, tokens)
# Expected output (column spacing not preserved in this copy):
# WFST 1 2 3 4 5 6 7
# 0 NP . . S . . S
# 1 . V . VP . . VP
# 2 . . Det NP . . .
# 3 . . . N . . .
# 4 . . . . P . PP
# 5 . . . . . Det NP
# 6 . . . . . . N

wfst1 = complete_wfst(wfst0, tokens, groucho_grammar, trace=True)
# Expected output (column spacing not preserved in this copy):
# [2] Det [3] N [4] ==> [2] NP [4]
# [5] Det [6] N [7] ==> [5] NP [7]
# [1] V [2] NP [4] ==> [1] VP [4]
# [4] P [5] NP [7] ==> [4] PP [7]
# [0] NP [1] VP [4] ==> [0] S [4]
# [1] VP [4] PP [7] ==> [1] VP [7]
# [0] NP [1] VP [7] ==> [0] S [7]

# ---------------------------------------------------------------------------
# 8.5 Dependencies and dependency grammar
# ---------------------------------------------------------------------------

# The original listing kept a commented-out call to
# nltk.parse_dependency_grammar( here; DependencyGrammar.fromstring is the
# call that is actually used.
groucho_dep_grammar = nltk.grammar.DependencyGrammar.fromstring("""
'shot' -> 'I' | 'elephant' | 'in'
'elephant' -> 'an' | 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
""")
print(groucho_dep_grammar)
# Expected output:
# Dependency grammar with 7 productions
#   'shot' -> 'I'
#   'shot' -> 'elephant'
#   'shot' -> 'in'
#   'elephant' -> 'an'
#   'elephant' -> 'in'
#   'in' -> 'pajamas'
#   'pajamas' -> 'my'

pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)
sent = 'I shot an elephant in my pajamas'.split()
trees = pdp.parse(sent)
for tree in trees:
    print(tree)
# Expected output:
# (shot I (elephant an (in (pajamas my))))
# (shot I (elephant an) (in (pajamas my)))

#   - Valency and the lexicon
#   - Scaling up

# ---------------------------------------------------------------------------
# 8.6 Grammar development
#   - Treebanks and grammars
# ---------------------------------------------------------------------------

t = treebank.parsed_sents('wsj_0001.mrg')[0]
print(t)
# Expected output:
# (S
#   (NP-SBJ
#     (NP (NNP Pierre) (NNP Vinken))
#     (, ,)
#     (ADJP (NP (CD 61) (NNS years)) (JJ old))
#     (, ,))
#   (VP
#     (MD will)
#     (VP
#       (VB join)
#       (NP (DT the) (NN board))
#       (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
#       (NP-TMP (NNP Nov.) (CD 29))))
#   (. .))


# Example 8-4: searching a treebank for sentential complements
# NOTE: the name shadows the builtin ``filter`` for the rest of this module;
# it is kept so the listing stays compatible with its original form.
def filter(tree):
    """Return True when ``tree`` is labelled 'VP' and has a child labelled 'S'.

    Only children that are ``nltk.Tree`` instances are inspected; leaf
    strings are ignored.
    """
    child_nodes = [child.label() for child in tree
                   if isinstance(child, nltk.Tree)]
    return (tree.label() == 'VP') and ('S' in child_nodes)


res = [subtree
       for tree in treebank.parsed_sents()
       for subtree in tree.subtrees(filter)]
print(res)
# Expected output:
# [Tree('VP', [Tree('VBN', ['named']), ...

# Group the verbs seen with each noun1-prep-noun2 triple by attachment label.
entries = nltk.corpus.ppattach.attachments('training')
table = defaultdict(lambda: defaultdict(set))
for entry in entries:
    key = entry.noun1 + '-' + entry.prep + '-' + entry.noun2
    table[key][entry.attachment].add(entry.verb)
for key in sorted(table):
    # More than one attachment label recorded for the same triple.
    if len(table[key]) > 1:
        print(key, 'N:', sorted(table[key]['N']), 'V:', sorted(table[key]['V']))
# Expected output:
# %-below-level N: ['left'] V: ['be']
# %-from-year N: ['was'] V: ['declined', 'dropped', 'fell', 'grew', 'increased', 'plunged', 'rose', 'was']
# ...

# Draws the selected Sinica treebank sentence via the tree's draw() method.
nltk.corpus.sinica_treebank.parsed_sents()[3450].draw()

#   - Pernicious ambiguity
grammar = nltk.CFG.fromstring("""
S -> NP V NP
NP -> NP Sbar
Sbar -> NP V
NP -> 'fish'
V -> 'fish'
""")

tokens = ["fish"] * 5
cp = nltk.ChartParser(grammar)
for tree in cp.parse(tokens):
    print(tree)
# Expected output:
# (S (NP fish) (V fish) (NP (NP fish) (Sbar (NP fish) (V fish))))
# (S (NP (NP fish) (Sbar (NP fish) (V fish))) (V fish) (NP fish))

#   - Weighted grammar


# Example 8-5: usage of "give" and "gave" in the Penn Treebank sample
def give(t):
    """Return True for a VP headed by give/gave with an NP then NP or PP-DTV.

    Requires more than two children; the second child must be labelled 'NP'
    and the third either 'PP-DTV' or 'NP'.
    """
    return (t.label() == 'VP' and len(t) > 2 and t[1].label() == 'NP'
            and (t[2].label() == 'PP-DTV' or t[2].label() == 'NP')
            and ('give' in t[0].leaves() or 'gave' in t[0].leaves()))


# NOTE: from here on the name ``sent`` refers to this function, replacing the
# token-list variable of the same name used in the sections above.
def sent(t):
    """Join the leaves of ``t`` with spaces, skipping tokens whose first
    character is '*', '-' or '0'."""
    return ' '.join(token for token in t.leaves() if token[0] not in '*-0')


def print_node(t, width):
    """Print a one-line summary of the first three children of ``t``.

    The line is cut to ``width`` characters and suffixed with '...' when it
    is longer than ``width``.
    """
    output = "%s %s: %s / %s: %s" % (
        sent(t[0]), t[1].label(), sent(t[1]), t[2].label(), sent(t[2]))
    if len(output) > width:
        output = output[:width] + "..."
    print(output)


for tree in nltk.corpus.treebank.parsed_sents():
    for t in tree.subtrees(give):
        print_node(t, 72)
# Expected output:
# gave NP: the chefs / NP: a standing ovation
# give NP: advertisers / NP: discounts for maintaining or increasing ad sp...
# give NP: it / PP-DTV: to the politicians
# gave NP: them / NP: similar help
# give NP: them / NP:
# give NP: only French history questions / PP-DTV: to students in a Europe...
# give NP: federal judges / NP: a raise
# give NP: consumers / NP: the straight scoop on the U.S. waste crisis
# gave NP: Mitsui / NP: access to a high-tech medical product
# give NP: Mitsubishi / NP: a window on the U.S. glass industry
# give NP: much thought / PP-DTV: to the rates she was receiving , nor to ...
# give NP: your Foster Savings Institution / NP: the gift of hope and free...
# give NP: market operators / NP: the authority to suspend trading in futu...
# gave NP: quick approval / PP-DTV: to $ 3.18 billion in supplemental appr...
# give NP: the Transportation Department / NP: up to 50 days to review any...
# give NP: the president / NP: such power
# give NP: me / NP: the heebie-jeebies
# give NP: holders / NP: the right , but not the obligation , to buy a cal...
# gave NP: Mr. Thomas / NP: only a `` qualified '' rating , rather than ``...
# give NP: the president / NP: line-item veto power

#   - Probabilistic context-free grammar

# Example 8-6: defining a probabilistic context-free grammar (PCFG)
grammar = nltk.PCFG.fromstring("""
S -> NP VP [1.0]
VP -> TV NP [0.4]
VP -> IV [0.3]
VP -> DatV NP NP [0.3]
TV -> 'saw' [1.0]
IV -> 'ate' [1.0]
DatV -> 'gave' [1.0]
NP -> 'telescopes' [0.8]
NP -> 'Jack' [0.2]
""")
print(grammar)
# Expected output:
# Grammar with 9 productions (start state = S)
#     S -> NP VP [1.0]
#     VP -> TV NP [0.4]
#     VP -> IV [0.3]
#     VP -> DatV NP NP [0.3]
#     TV -> 'saw' [1.0]
#     IV -> 'ate' [1.0]
#     DatV -> 'gave' [1.0]
#     NP -> 'telescopes' [0.8]
#     NP -> 'Jack' [0.2]

viterbi_parser = nltk.ViterbiParser(grammar)
trees = viterbi_parser.parse(['Jack', 'saw', 'telescopes'])
for tree in trees:
    print(tree)
# Expected output:
# (S (NP Jack) (VP (TV saw) (NP telescopes))) (p=0.064)

转载请注明原文地址: https://www.6miu.com/read-43618.html

技术

最新回复(0)