# Managing Linguistic Data (语言数据管理)
import nltk
# --- TIMIT corpus: phone list, word timings, pronunciation lexicon, speaker info ---
phonetic = nltk.corpus.timit.phones('dr1-fvmh0/sa1')
print(phonetic)

res = nltk.corpus.timit.word_times('dr1-fvmh0/sa1')
print(res)

timitdict = nltk.corpus.timit.transcription_dict()
res = timitdict['greasy'] + timitdict['wash'] + timitdict['water']
print(res)
print(phonetic[17:30])

res = nltk.corpus.timit.spkrinfo('dr1-fvmh0')
print(res)

# --- windowdiff: compare two segmentations ('1' marks a boundary) ---
# NOTE: the original source split these assignments across two lines
# ("s1 =" / '"000..."'), which is a SyntaxError in Python; fixed here.
s1 = "00000010000000001000000"
s2 = "00000001000000010000000"
s3 = "00001000000000000001000"
# Identical segmentations score 0.0; larger values mean more disagreement.
res = nltk.windowdiff(s1, s1, 3)
print(res)
res = nltk.windowdiff(s1, s2, 3)
print(res)
res = nltk.windowdiff(s2, s3, 3)
print(res)
"""dict.htm.html
<p class=MsoNormal>sleep
<span style='mso-spacerun:yes'></span>
[<span class=S;ellE>sli:p</span>]
<span style='mso-spancerun:yes'></span>
<b><span style='font-size:11.0pt'>v.i</span></b>
<span style='mso-spacerun:yes'></span>
<i>a condition of body and mind ...<o:p></o:p></i>
</p>
"""
import re

# Part-of-speech labels permitted in the dictionary markup.
legal_pos = {'n', 'v.t.', 'v.i.', 'adj', 'det'}
# POS fields appear inside 11pt spans in the word-processor HTML export.
pattern = re.compile(r"'font-size:11.0pt'>([a-z.]+)<")
with open('dict.htm.html') as doc_file:  # was left open; use a context manager
    document = doc_file.read()
used_pos = set(re.findall(pattern, document))
# Any POS label used in the document but not in the legal set is an error.
illegal_pos = used_pos.difference(legal_pos)
print(list(illegal_pos))
import bs4, lxml
def lexical_data(html_file):
    """Yield 4-element entries [lexeme, phonetics, pos, gloss] parsed from a
    word-processor HTML export of a dictionary.

    Each ``<p>`` element is treated as one dictionary entry.  Entries with
    fewer than three spaces (i.e. fewer than four whitespace-separated
    fields) are skipped.
    """
    SEP = '_INTRY'  # sentinel assumed not to occur in the document text
    with open(html_file) as f:  # was left open; use a context manager
        html = f.read()
    # Tag every paragraph start so entry boundaries survive get_text().
    html = re.sub(r'<p', SEP + '<p', html)
    text = bs4.BeautifulSoup(html, "lxml").get_text()
    text = ' '.join(text.split())  # normalise all runs of whitespace
    for entry in text.split(SEP):
        if entry.count(' ') > 2:
            # Split into at most 4 fields; the gloss keeps its spaces.
            yield entry.split(' ', 3)
import csv

# Write the parsed dictionary entries out as CSV, one row per entry.
# newline='' is required by the csv module to avoid blank lines on Windows;
# the original also never closed the output file.
with open("dict1.csv", "w", newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerows(lexical_data("dict.htm.html"))
"""
# dict.csv
"sleep", "sli:p", "v.i", "a condition of body and mind ..."
"walk", "wo:k", "v.intr", "progress by lifting and setting down each foot ..."
"wake", "weik", "intrans", "cease to sleep"
"""
import csv

# Read the CSV back in and find definition words that are not themselves
# headwords (lexemes) of the dictionary.
with open("dict.csv", newline='') as lex_file:  # was left open
    lexicon = csv.reader(lex_file)
    pairs = [(lexeme, defn) for (lexeme, _, _, defn) in lexicon]
lexemes, defns = zip(*pairs)
defn_words = set(w for defn in defns for w in defn.split())
res = sorted(defn_words.difference(lexemes))
print(res)
"""
['"a', '"cease', '"progress', '..."', 'and', 'body', 'by', 'condition', 'down',
'each', 'foot', 'lifting', 'mind', 'of', 'setting', 'sleep"', 'to']
"""
# Invert the lexicon: map each definition word (longer than 3 chars) back to
# the lexemes whose definitions contain it.
idx = nltk.Index((defn_word, lexeme)
                 for (lexeme, defn) in pairs
                 for defn_word in nltk.word_tokenize(defn)
                 if len(defn_word) > 3)
# NOTE: the original split statements mid-expression ("for word" / "in
# sorted(idx):"), which is a SyntaxError, and never closed the output file.
with open("dict.idx", "w") as idx_file:
    for word in sorted(idx):
        idx_words = ', '.join(idx[word])
        idx_line = "%s: %s\n" % (word, idx_words)
        idx_file.write(idx_line)
"""dict.idx
body: sleep
cease: wake
condition: sleep
down: walk
each: walk
foot: walk
lifting: walk
mind: sleep
progress: walk
setting: walk
sleep: wake
"""
# Rewrite rules applied in order to normalise spelling before signature
# extraction: collapse common digraphs, squash every vowel run to 'a',
# and de-duplicate doubled letters.
mappings = [('ph', 'f'), ('ght', 't'), ('^kn', 'n'), ('qu', 'kw'),
            ('[aeiou]+', 'a'), (r'(.)\1', r'\1')]


def signature(word):
    """Return a crude phonetic signature for *word* (at most 8 chars).

    The signature is the concatenation of the word's consonant clusters,
    each cluster sorted internally, after the ``mappings`` rewrites.
    Words that "sound alike" tend to share a signature, so it can be used
    to bucket candidate spellings.
    """
    for patt, repl in mappings:
        word = re.sub(patt, repl, word)
    # BUG FIX: extract the *consonant* runs ('[^aeiou]+').  The original
    # pattern '[aeiou]+' matched only vowels, which the mappings had just
    # collapsed to 'a', so every signature degenerated to "aaa..." and the
    # index below bucketed nearly all words together.
    pieces = re.findall('[^aeiou]+', word)
    return ''.join(char for piece in pieces for char in sorted(piece))[:8]
# Show the signature of a few misspellings.
print(signature('illefent'))
print(signature('ebsekwieous'))
print(signature('nuculerr'))

# Bucket the entire word list by signature: signature -> list of words.
signatures = nltk.Index((signature(w), w) for w in nltk.corpus.words.words())
res = signatures[signature('nuculerr')]
print(res)
# NOTE: the original split this import across two lines
# ("from nltk.metrics" / "import edit_distance"), a SyntaxError.
from nltk.metrics import edit_distance


def rank(word, wordlist):
    """Return *wordlist* sorted by edit distance from *word*, closest first."""
    ranked = sorted((edit_distance(word, w), w) for w in wordlist)
    # Use a distinct comprehension variable: the original reused 'word',
    # shadowing the parameter.
    return [w for (_, w) in ranked]
def fuzzy_spell(word):
    """Return candidate corrections for *word*, ranked by edit distance.

    Candidates are the dictionary words sharing *word*'s signature; a
    signature with no bucket yields an empty list.
    """
    sig = signature(word)
    if sig in signatures:
        return rank(word, signatures[sig])
    return []
# Suggest corrections for a few misspellings.
for misspelling in ('illefent', 'ebsekwieous', 'nucular'):
    print(fuzzy_spell(misspelling))