NLTK12《Python自然语言处理》code11 语言数据管理

xiaoxiao2021-02-28 132

语言数据管理

# -*- coding: utf-8 -*- # win10 python3.5.3/python3.6.1 nltk3.2.4 # 《Python自然语言处理》 11 语言数据管理 # pnlp11.py import nltk # 11.1 语料库结构：案例研究 phonetic = nltk.corpus.timit.phones('dr1-fvmh0/sa1') print(phonetic) # ['h#', 'sh', 'iy', 'hv', 'ae', 'dcl', 'y', 'ix',... res = nltk.corpus.timit.word_times('dr1-fvmh0/sa1') print(res) # [('she', 7812, 10610), ('had', 10610, 14496),... timitdict = nltk.corpus.timit.transcription_dict() res = timitdict['greasy'] + timitdict['wash'] + timitdict['water'] print(res) # ['g', 'r', 'iy1', 's', 'iy', 'w', 'ao1', 'sh', 'w', 'ao1', 't', 'axr'] print(phonetic[17:30]) # ['g', 'r', 'iy', 's', 'iy', 'w', 'aa', 'sh', 'epi', 'w', 'aa', 'dx', 'ax'] res = nltk.corpus.timit.spkrinfo('dr1-fvmh0') print(res) # SpeakerInfo(id='VMH0', sex='F', dr='1', use='TRN', recdate='03/11/86', ... # 主要设计特点 # 基本数据类型 # 11.2 语料库生命周期 # 创建语料库的3种方案 # 质量控制 s1 = "00000010000000001000000" s2 = "00000001000000010000000" s3 = "00001000000000000001000" res = nltk.windowdiff(s1, s1, 3) print(res) # 0.0 res = nltk.windowdiff(s1, s2, 3) print(res) # 0.19047619047619047 res = nltk.windowdiff(s2, s3, 3) print(res) # 0.5714285714285714 # 维护与演变 # 11.3 数据采集 # 从网上获取数据 # 从文字处理器文件获取数据 """dict.htm.html sleep [sli:p] v.i a condition of body and mind ...<o:p></o:p> """ import re legal_pos = set(['n', 'v.t.', 'v.i.', 'adj', 'det']) pattern = re.compile(r"'font-size:11.0pt'>([a-z.]+)<") document = open('dict.htm.html').read() used_pos = set(re.findall(pattern, document)) illegal_pos = used_pos.difference(legal_pos) print(list(illegal_pos)) # ['v.i'] # 例11-1 将Microsoft Word 创建的HTML转换成CSV import bs4, lxml def lexical_data(html_file): SEP = '_INTRY' html = open(html_file).read() html = re.sub(r'<p', SEP + '<p', html) # text = nltk.clean_html(html) text = bs4.BeautifulSoup(html, "lxml").get_text() text = ' '.join(text.split()) for entry in text.split(SEP): if entry.count(' ') > 2: yield entry.split(' ', 3) import csv writer = csv.writer(open("dict1.csv", "w")) writer.writerows(lexical_data("dict.htm.html")) # dict1.csv # sleep,[sli:p],v.i,a condition of body and mind ... # 从电子表格和数据库中获取数据 """ # dict.csv "sleep", "sli:p", "v.i", "a condition of body and mind ..." "walk", "wo:k", "v.intr", "progress by lifting and setting down each foot ..." "wake", "weik", "intrans", "cease to sleep" """ import csv lexicon = csv.reader(open("dict.csv")) pairs = [(lexeme, defn) for (lexeme, _, _, defn) in lexicon] lexemes, defns = zip(*pairs) defn_words = set(w for defn in defns for w in defn.split()) res = sorted(defn_words.difference(lexemes)) print(res) """ ['"a', '"cease', '"progress', '..."', 'and', 'body', 'by', 'condition', 'down', 'each', 'foot', 'lifting', 'mind', 'of', 'setting', 'sleep"', 'to'] """ # 转换数据格式 idx = nltk.Index((defn_word, lexeme) for (lexeme, defn) in pairs for defn_word in nltk.word_tokenize(defn) if len(defn_word) > 3) idx_file = open("dict.idx", "w") for word in sorted(idx): idx_words = ', '.join(idx[word]) idx_line = "%s: %s\n" % (word, idx_words) idx_file.write(idx_line) idx_file.close() """dict.idx body: sleep cease: wake condition: sleep down: walk each: walk foot: walk lifting: walk mind: sleep progress: walk setting: walk sleep: wake """ # 决定要包含的标注层 # 标准和工具 # 处理濒危语言时特别注意事项 # 有错误 mappings = [('ph', 'f'), ('ght', 't'), ('^kn', 'n'), ('qu', 'kw'), ('[aeiou]+', 'a'), (r'(.)\1', r'\1')] def signature(word): for patt, repl in mappings: word = re.sub(patt, repl, word) pieces = re.findall('[aeiou]+', word) return ''.join(char for piece in pieces for char in sorted(piece))[:8] print(signature('illefent')) # aaa print(signature('ebsekwieous')) # aaa print(signature('nuculerr')) # aaa signatures = nltk.Index((signature(w), w) for w in nltk.corpus.words.words()) res = signatures[signature('nuculerr')] print(res) # ['Aaronic', 'abaca', 'abacay', 'abacist', 'abaction', ... from nltk.metrics import edit_distance def rank(word, wordlist): ranked = sorted((edit_distance(word, w), w) for w in wordlist) return [word for (_, word) in ranked] def fuzzy_spell(word): sig = signature(word) if sig in signatures: return rank(word, signatures[sig]) else: return [] print(fuzzy_spell('illefent')) # ['idlement', 'element', 'idleset', ... print(fuzzy_spell('ebsekwieous')) # formulaic', 'formular', 'formulary',... print(fuzzy_spell('nucular')) # ['Ducula', 'Nucula', 'cumular', 'facular',... # 11.4 使用XML # 语言结构中使用XML # XML的作用 # ElementTree接口 # 使用ElementTree访问Toolbox数据 # 格式化条目 # 11.5 使用Toolbox数据 # 为每个条目增加字段 # 11.6 使用OLAC元数据描述语言资源

转载请注明原文地址: https://www.6miu.com/read-60691.html

技术

最新回复(0)