def main():
    """Command-line entry point: extract (word, context) pairs in parallel.

    Parses the command line with docopt, starts ``--threads_num`` worker
    processes that each run ``c2p(args, worker_id)``, and waits for all of
    them to finish. Despite the option name, the workers are
    ``multiprocessing.Process`` instances, not threads.

    Raises:
        RuntimeError: if any worker process ends with a non-zero exit code,
            so a crashed worker is not reported as a successful run.
    """
    # docopt builds the parser from the layout of this text: the usage
    # pattern and every option must sit on their own lines, and an option
    # must be separated from its description by at least two spaces.
    args = docopt("""
    Usage:
        corpus2pairs.py [options] <corpus> <vocab> <pairs>

    Options:
        --win NUM                  Window size [default: 2]
        --sub NUM                  Subsampling threshold [default: 0]
        --ngram_word NUM           (Center) word vocabulary includes grams of 1st to nth order [default: 1]
        --ngram_context NUM        Context vocabulary includes grams of 1st to nth order [default: 1]
        --threads_num NUM          The number of threads [default: 8]
        --overlap                  Whether overlaping pairs are allowed or not
    """)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("**********************")
    print("corpus2pairs")
    threads_num = int(args['--threads_num'])

    # Extract pairs from the corpus with several workers; each one receives
    # the parsed arguments together with its own worker id.
    workers = []
    for tid in range(threads_num):
        worker = multiprocessing.Process(target=c2p, args=(args, tid))
        worker.start()
        workers.append(worker)

    # Wait for every worker before reporting the outcome.
    for worker in workers:
        worker.join()

    # exitcode is 0 only for a worker that returned normally.
    failed = [str(tid) for tid, worker in enumerate(workers)
              if worker.exitcode != 0]
    if failed:
        raise RuntimeError(
            "corpus2pairs failed in worker(s): " + ", ".join(failed))
    print("corpus2pairs finished")
def c2p(args, tid):
    """Worker body: write the (word, context) pairs for one share of the corpus.

    Every worker reads the whole corpus file but only processes the lines
    whose 1-based line number satisfies ``line_num % threads_num == tid``.
    Pairs are written one per line, as ``word + ' ' + context``, to the file
    ``<pairs>_<tid>``, so each worker has its own output file.

    Args:
        args: parsed command-line options; the keys read here are
            '<corpus>', '<vocab>', '<pairs>', '--win', '--sub',
            '--ngram_word', '--ngram_context', '--overlap' and
            '--threads_num'.
        tid: id of this worker, in ``range(threads_num)``. Worker 0 also
            prints vocabulary statistics and progress messages.
    """
    win = int(args['--win'])
    subsample = float(args['--sub'])
    sub = subsample != 0
    ngram_word = int(args['--ngram_word'])        # highest n-gram order for center words
    ngram_context = int(args['--ngram_context'])  # highest n-gram order for contexts
    overlap = args['--overlap']                   # keep pairs whose two n-grams overlap?
    threads_num = int(args['--threads_num'])

    # Vocabulary with counts, produced by the corpus2vocab stage; only
    # n-grams present in it can appear in a pair (see check_word).
    vocab = load_count_vocabulary(args['<vocab>'])

    # Number of unigram tokens: entries whose key contains '@$' are left out
    # (presumably '@$' is the joiner used inside higher-order n-grams --
    # confirm against getNgram). Only words present in the vocabulary are
    # counted, so anything filtered out when it was built is not included.
    train_uni_num = sum(c for w, c in vocab.items() if '@$' not in w)
    train_num = sum(vocab.values())  # number of (n-gram) tokens in the vocabulary
    if tid == 0:
        print('vocabulary size: ' + str(len(vocab)))
        print('number of training words (uni-grams): ' + str(train_uni_num))
        print('number of training n-grams: ' + str(train_num))

    # Subsampling: an entry counted more often than the scaled threshold is
    # dropped with probability 1 - sqrt(threshold / count).
    subsample *= train_uni_num
    # Always bind the name: it is passed to check_word even when subsampling
    # is disabled (the default --sub 0), which previously raised
    # UnboundLocalError.
    subsampler = None
    if sub:
        subsampler = {w: 1 - sqrt(subsample / c)
                      for w, c in vocab.items() if c > subsample}

    rnd = Random(17)  # fixed seed keeps the subsampling reproducible
    pairs_path = args['<pairs>'] + "_" + str(tid)
    # Both files are closed even if processing raises part-way through.
    with open(args['<corpus>']) as f, open(pairs_path, 'w') as pairs_file:
        line_num = 0
        if tid == 0:
            print(str(line_num // 1000) + "K lines processed.")
        for line in f:
            line_num += 1
            if line_num % 1000 == 0 and tid == 0:
                # "\x1b[1A" moves the cursor up one line so the progress
                # message overwrites the previous one.
                print("\x1b[1A" + str(line_num // 1000) + "K lines processed.")
            if line_num % threads_num != tid:
                continue  # this line belongs to another worker

            tokens = line.strip().split()
            # enumerate() keeps the position loop lazy on Python 2 and 3.
            for i, _ in enumerate(tokens):
                # Center-word n-grams of every order starting at position i.
                for gram_word in range(1, ngram_word + 1):
                    word = check_word(getNgram(tokens, i, gram_word),
                                      vocab, sub, subsampler, rnd)
                    if word is None:
                        continue  # rejected center word: no pairs for it
                    # Context n-grams of every order inside the window.
                    for gram_context in range(1, ngram_context + 1):
                        start = i - win + gram_word - 1
                        end = i + win - gram_context + 1
                        for j in range(start, end + 1):
                            if overlap:
                                # Overlap allowed: only skip the pair made
                                # of the very same n-gram.
                                if i == j and gram_word == gram_context:
                                    continue
                            elif i < j + gram_context and j < i + gram_word:
                                # Overlap not allowed: the token spans
                                # [i, i + gram_word) and
                                # [j, j + gram_context) intersect.
                                continue
                            context = check_word(
                                getNgram(tokens, j, gram_context),
                                vocab, sub, subsampler, rnd)
                            if context is None:
                                continue
                            pairs_file.write(word + ' ' + context + "\n")
def check_word(t, vocab, sub, subsampler, rnd):
    """Decide whether an n-gram is kept; return it unchanged or None.

    Args:
        t: the candidate n-gram, or None.
        vocab: container of accepted n-grams; only membership is tested.
        sub: whether subsampling is enabled. When false, ``subsampler`` and
            ``rnd`` are not touched.
        subsampler: mapping from n-gram to its drop probability; n-grams not
            in the mapping are never dropped by subsampling.
        rnd: object with a ``random()`` method returning a float; one value
            is drawn only for an n-gram that has an entry in ``subsampler``.

    Returns:
        ``t`` if it is not None, survives subsampling and is in ``vocab``;
        otherwise None.
    """
    if t is None:
        return None
    # Subsampling: an n-gram with drop probability p is discarded whenever
    # the random draw is <= p.
    if sub and t in subsampler and rnd.random() <= subsampler[t]:
        return None
    # Only n-grams from the vocabulary may appear in a pair.
    return t if t in vocab else None