# Analyzing sentence structure
import nltk
# Toy grammar in which a PP can be produced either inside an NP
# (NP -> Det N PP) or next to a VP (VP -> VP PP).
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")

sent = 'I shot an elephant in my pajamas'.split()

# Chart-parse the sentence and print every tree the parser yields.
parser = nltk.ChartParser(groucho_grammar)
trees = parser.parse(sent)
for tree in trees:
    print(tree)

# Expected output (kept as a bare string literal, which has no runtime effect):
"""
(S
(NP I)
(VP
(VP (V shot) (NP (Det an) (N elephant)))
(PP (P in) (NP (Det my) (N pajamas)))))
(S
(NP I)
(VP
(V shot)
(NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))
"""
# A slightly larger context-free grammar, defined inline.
grammar1 = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V NP PP
PP -> P NP
V -> "saw" | "ate" | "walked"
NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
Det -> "a" | "an" | "the" | "my"
N -> "man" | "dog" | "cat" | "telescope" | "park"
P -> "in" | "on" | "by" | "with"
""")

sent = ["Mary", "saw", "Bob"]

# Parse with the recursive-descent parser and print each resulting tree.
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)

# Expected output:
"""(S (NP Mary) (VP (V saw) (NP Bob)))"""
# Same experiment, but the grammar is loaded through nltk.data.load from the
# 'file:mygrammar.cfg' resource instead of being defined inline.
grammar1 = nltk.data.load('file:mygrammar.cfg')

sent = ["Mary", "saw", "Bob"]

rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)

# Expected output:
"""(S (NP Mary) (VP (V saw) (NP Bob)))"""
# Grammar with a recursive nominal rule (Nom -> Adj Nom) and a sentential
# complement rule (VP -> V S).
grammar2 = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det Nom | PropN
Nom -> Adj Nom | N
VP -> V Adj | V NP | V S | V NP PP
PP -> P NP
PropN -> 'Buster' | 'Chatterer' | 'Joe'
Det -> 'the' | 'a'
N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
Adj -> 'angry' | 'frightened' | 'little' | 'tall'
V -> 'chased' | 'saw' | 'said' | 'thought' | 'was' | 'put'
P -> 'on'
""")
def init_wfst(tokens, grammar):
    """Create a well-formed substring table seeded with lexical categories.

    The table is a square list of lists with ``len(tokens) + 1`` rows and
    columns, initially filled with ``None``.  For the token at position
    ``k`` the cell ``[k][k + 1]`` is set to the left-hand side of the first
    production returned by ``grammar.productions(rhs=token)``.

    :param tokens: sequence of input tokens.
    :param grammar: object exposing ``productions(rhs=...)`` whose results
        expose ``lhs()``.
    :return: the newly built table.
    :raises IndexError: if no production is returned for some token.
    """
    size = len(tokens) + 1
    # Each row is its own list, so writing to one row never affects another.
    wfst = [[None] * size for _ in range(size)]
    for pos, word in enumerate(tokens):
        lexical = grammar.productions(rhs=word)
        wfst[pos][pos + 1] = lexical[0].lhs()
    return wfst
def complete_wfst(wfst, tokens, grammar, trace=False):
    """Fill in the cells of *wfst* that span two or more tokens.

    A lookup from right-hand side to left-hand side is built from
    ``grammar.productions()``; when several productions share a right-hand
    side, the last one wins.  Spans are processed from width 2 upwards.  For
    each span and each split point, if both halves are filled and the pair of
    categories is a known right-hand side, the span's cell is overwritten
    with the corresponding left-hand side.

    :param wfst: table to complete; it is modified in place.
    :param tokens: the input tokens (only their count is used).
    :param grammar: object exposing ``productions()`` whose results expose
        ``rhs()`` and ``lhs()``.
    :param trace: when true, print one line per cell assignment.
    :return: the same *wfst* object that was passed in.
    """
    rhs_to_lhs = {}
    for prod in grammar.productions():
        rhs_to_lhs[prod.rhs()] = prod.lhs()

    n = len(tokens)
    for width in range(2, n + 1):
        # Only start positions whose span still fits inside the sentence.
        for left in range(n - width + 1):
            right = left + width
            for split in range(left + 1, right):
                first, second = wfst[left][split], wfst[split][right]
                if not (first and second):
                    continue
                pair = (first, second)
                if pair not in rhs_to_lhs:
                    continue
                parent = rhs_to_lhs[pair]
                wfst[left][right] = parent
                if trace:
                    print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]"
                          % (left, first, split, second, right,
                             left, parent, right))
    return wfst
def display(wfst, tokens):
    """Print *wfst* as a text grid.

    A blank line and a header row of column numbers (1 .. len(wfst) - 1) come
    first, then one line per row except the last.  Each line starts with the
    row index; cells that are falsy (e.g. ``None``) are shown as ``'.'``.

    :param wfst: square table as produced by ``init_wfst``.
    :param tokens: accepted for interface symmetry; not used by this function.
    """
    size = len(wfst)
    columns = range(1, size)
    header = ' '.join("%-4d" % col for col in columns)
    print('\nWFST ' + header)
    for row in range(size - 1):
        cells = ''.join("%-4s" % (wfst[row][col] or '.') for col in columns)
        print("%d " % row + cells)
# Seed the table with the lexical categories of each word and show it.
tokens = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
wfst0 = init_wfst(tokens, groucho_grammar)
display(wfst0, tokens)
# Expected output:
"""
WFST 1 2 3 4 5 6 7
0 NP . . . . . .
1 . V . . . . .
2 . . Det . . . .
3 . . . N . . .
4 . . . . P . .
5 . . . . . Det .
6 . . . . . . N
"""

# complete_wfst modifies wfst0 in place and returns it, so wfst1 refers to
# the same table object.
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar)
display(wfst1, tokens)
# Expected output:
"""
WFST 1 2 3 4 5 6 7
0 NP . . S . . S
1 . V . VP . . VP
2 . . Det NP . . .
3 . . . N . . .
4 . . . . P . PP
5 . . . . . Det NP
6 . . . . . . N
"""

# Run the completion again with tracing to see each cell assignment.
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar, trace=True)
# Expected output:
"""
[2] Det [3] N [4] ==> [2] NP [4]
[5] Det [6] N [7] ==> [5] NP [7]
[1] V [2] NP [4] ==> [1] VP [4]
[4] P [5] NP [7] ==> [4] PP [7]
[0] NP [1] VP [4] ==> [0] S [4]
[1] VP [4] PP [7] ==> [1] VP [7]
[0] NP [1] VP [7] ==> [0] S [7]
"""
import nltk
# Dependency grammar for the same sentence: each rule lists a head word and
# the words that may depend on it.
groucho_dep_grammar = nltk.grammar.DependencyGrammar.fromstring("""
'shot' -> 'I' | 'elephant' | 'in'
'elephant' -> 'an' | 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
""")
print(groucho_dep_grammar)
# Expected output:
"""
Dependency grammar with 7 productions
'shot' -> 'I'
'shot' -> 'elephant'
'shot' -> 'in'
'elephant' -> 'an'
'elephant' -> 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
"""

# Parse the sentence with the projective dependency parser and print each
# analysis it yields.
pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
trees = pdp.parse(sent)
for tree in trees:
    print(tree)
# Expected output:
"""
(shot I (elephant an (in (pajamas my))))
(shot I (elephant an) (in (pajamas my)))
"""
from nltk.corpus
import treebank
# Take the first parsed sentence of the treebank file 'wsj_0001.mrg' and
# print its tree.
t = treebank.parsed_sents('wsj_0001.mrg')[0]
print(t)
# Expected output:
"""
(S
(NP-SBJ
(NP (NNP Pierre) (NNP Vinken))
(, ,)
(ADJP (NP (CD 61) (NNS years)) (JJ old))
(, ,))
(VP
(MD will)
(VP
(VB join)
(NP (DT the) (NN board))
(PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
(NP-TMP (NNP Nov.) (CD 29))))
(. .))
"""
def filter(tree):
    """Return True when *tree* is labelled 'VP' and has a child labelled 'S'.

    Only children that are ``nltk.Tree`` instances are inspected; other
    children (such as plain strings) are ignored.

    NOTE: this name shadows the ``filter`` builtin for the rest of the module.

    :param tree: object exposing ``label()`` and iteration over its children.
    :return: ``True`` or ``False``.
    """
    if tree.label() != 'VP':
        return False
    return any(
        isinstance(child, nltk.Tree) and child.label() == 'S'
        for child in tree
    )
from nltk.corpus
import treebank
# Collect, across every parsed treebank sentence, the subtrees accepted by
# the module-level `filter` predicate defined above.
res = []
for tree in treebank.parsed_sents():
    res.extend(tree.subtrees(filter))
print(res)
# Expected output (abbreviated):
"""[Tree('VP', [Tree('VBN', ['named']), ..."""
import nltk
# Use the standard-library defaultdict directly instead of reaching it
# through the nltk namespace, where it is only available as an incidental
# re-export.
from collections import defaultdict

# Group the PP-attachment training instances by "noun1-prep-noun2" and, for
# each group, record which verbs occur with each attachment label.
entries = nltk.corpus.ppattach.attachments('training')
table = defaultdict(lambda: defaultdict(set))
for entry in entries:
    key = '-'.join((entry.noun1, entry.prep, entry.noun2))
    table[key][entry.attachment].add(entry.verb)

# Report only the keys that were seen with more than one attachment label.
for key in sorted(table):
    if len(table[key]) > 1:
        print(key,
              'N:', sorted(table[key]['N']),
              'V:', sorted(table[key]['V']))
# Expected output (abbreviated):
"""
%-below-level N: ['left'] V: ['be']
%-from-year N: ['was'] V: ['declined', 'dropped', 'fell', 'grew', 'increased', 'plunged', 'rose', 'was']
...
"""
# Call draw() on the parsed sentence at index 3450 of the Sinica treebank.
nltk.corpus.sinica_treebank.parsed_sents()[3450].draw()
import nltk
# Grammar in which the single word 'fish' can be either an NP or a V.
grammar = nltk.CFG.fromstring("""
S -> NP V NP
NP -> NP Sbar
Sbar -> NP V
NP -> 'fish'
V -> 'fish'
""")

# Five repetitions of the same word.
tokens = ["fish" for _ in range(5)]

cp = nltk.ChartParser(grammar)
for tree in cp.parse(tokens):
    print(tree)
# Expected output:
"""
(S (NP fish) (V fish) (NP (NP fish) (Sbar (NP fish) (V fish))))
(S (NP (NP fish) (Sbar (NP fish) (V fish))) (V fish) (NP fish))
"""
def give(t):
    """Return True when *t* is a 'VP' headed by give/gave with two objects.

    All of the following must hold:

    * ``t.label()`` is ``'VP'`` and *t* has more than two children;
    * the second child is labelled ``'NP'``;
    * the third child is labelled ``'PP-DTV'`` or ``'NP'``;
    * ``'give'`` or ``'gave'`` is among the leaves of the first child.

    :param t: tree-like object supporting ``label()``, ``len()`` and indexing;
        its children must support ``label()`` / ``leaves()``.
    :return: ``True`` or ``False``.
    """
    if t.label() != 'VP' or len(t) <= 2:
        return False
    if t[1].label() != 'NP':
        return False
    if t[2].label() not in ('PP-DTV', 'NP'):
        return False
    verb_words = t[0].leaves()
    return 'give' in verb_words or 'gave' in verb_words
def sent(t):
    """Join the leaves of *t* with single spaces, skipping marker tokens.

    A leaf is skipped when its first character is ``'*'``, ``'-'`` or
    ``'0'`` (presumably treebank trace / empty-element markers; confirm
    against the corpus documentation).

    :param t: object exposing ``leaves()`` that returns non-empty strings.
    :return: the remaining leaves joined by ``' '``.
    """
    visible = []
    for word in t.leaves():
        if word[0] in '*-0':
            continue
        visible.append(word)
    return ' '.join(visible)
def print_node(t, width):
    """Print a one-line summary of the first three children of *t*.

    The line has the form ``"<c0> <label1>: <c1> / <label2>: <c2>"``, where
    ``<cN>`` is ``sent()`` applied to child N and ``<labelN>`` is that child's
    label.  If the line is longer than *width* characters it is cut to
    *width* characters and ``"..."`` is appended.

    :param t: tree-like object with at least three children.
    :param width: maximum number of characters kept before the ellipsis.
    """
    head, first_obj, second_obj = t[0], t[1], t[2]
    output = "%s %s: %s / %s: %s" % (
        sent(head),
        first_obj.label(), sent(first_obj),
        second_obj.label(), sent(second_obj),
    )
    if len(output) > width:
        output = output[:width] + "..."
    print(output)
# Walk every parsed treebank sentence and print a summary, truncated at
# 72 characters, of each subtree accepted by `give`.
for tree in nltk.corpus.treebank.parsed_sents():
    for t in tree.subtrees(give):
        print_node(t, 72)
# Expected output:
"""
gave NP: the chefs / NP: a standing ovation
give NP: advertisers / NP: discounts for maintaining or increasing ad sp...
give NP: it / PP-DTV: to the politicians
gave NP: them / NP: similar help
give NP: them / NP:
give NP: only French history questions / PP-DTV: to students in a Europe...
give NP: federal judges / NP: a raise
give NP: consumers / NP: the straight scoop on the U.S. waste crisis
gave NP: Mitsui / NP: access to a high-tech medical product
give NP: Mitsubishi / NP: a window on the U.S. glass industry
give NP: much thought / PP-DTV: to the rates she was receiving , nor to ...
give NP: your Foster Savings Institution / NP: the gift of hope and free...
give NP: market operators / NP: the authority to suspend trading in futu...
gave NP: quick approval / PP-DTV: to $ 3.18 billion in supplemental appr...
give NP: the Transportation Department / NP: up to 50 days to review any...
give NP: the president / NP: such power
give NP: me / NP: the heebie-jeebies
give NP: holders / NP: the right , but not the obligation , to buy a cal...
gave NP: Mr. Thomas / NP: only a `` qualified '' rating , rather than ``...
give NP: the president / NP: line-item veto power
"""
import nltk
# Probabilistic grammar: the bracketed number after each production is its
# probability.
grammar = nltk.PCFG.fromstring("""
S -> NP VP [1.0]
VP -> TV NP [0.4]
VP -> IV [0.3]
VP -> DatV NP NP [0.3]
TV -> 'saw' [1.0]
IV -> 'ate' [1.0]
DatV -> 'gave' [1.0]
NP -> 'telescopes' [0.8]
NP -> 'Jack' [0.2]
""")
print(grammar)
# Expected output:
"""
Grammar with 9 productions (start state = S)
S -> NP VP [1.0]
VP -> TV NP [0.4]
VP -> IV [0.3]
VP -> DatV NP NP [0.3]
TV -> 'saw' [1.0]
IV -> 'ate' [1.0]
DatV -> 'gave' [1.0]
NP -> 'telescopes' [0.8]
NP -> 'Jack' [0.2]
"""

# Parse a three-word sentence with the Viterbi parser and print each tree it
# yields.
viterbi_parser = nltk.ViterbiParser(grammar)
trees = viterbi_parser.parse('Jack saw telescopes'.split())
for tree in trees:
    print(tree)
# Expected output:
"""
(S (NP Jack) (VP (TV saw) (NP telescopes))) (p=0.064)
"""